My http_trigger test function runs smoothly and writes files to my Azure Blob Storage as expected. However, when I test extract_signals, which spins up a full headless Chrome browser, uses ChromeDriverManager().install() to dynamically install a chromedriver, and gets the necessary credentials via DefaultAzureCredential() to access my Azure Key Vault, the function fails with an internal server error 500. I don't know how to troubleshoot this, as the only information Application Insights shows me is the following:
2025-07-28T04:59:31Z [Verbose] AuthenticationScheme: WebJobsAuthLevel was successfully authenticated.
2025-07-28T04:59:31Z [Verbose] Authorization was successful.
2025-07-28T04:59:31Z [Information] Executing 'Functions.extract_signals' (Reason='This function was programmatically called via the host APIs.', Id=5e2ad53f-b0ca-4349-b470-c654ab4c8f2c)
2025-07-28T04:59:31Z [Verbose] Sending invocation id: '5e2ad53f-b0ca-4349-b470-c654ab4c8f2c
2025-07-28T04:59:31Z [Verbose] Posting invocation id:5e2ad53f-b0ca-4349-b470-c654ab4c8f2c on workerId:d9a24210-7c41-47c1-8649-ce95814f013f
2025-07-28T04:59:31Z [Information] ====== WebDriver manager ======
2025-07-28T04:59:31Z [Information] Get LATEST chromedriver version for google-chrome
2025-07-28T04:59:32Z [Information] About to download new driver from https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip
2025-07-28T04:59:32Z [Information] Driver downloading response is 200
2025-07-28T04:59:32Z [Information] Get LATEST chromedriver version for google-chrome
2025-07-28T04:59:32Z [Information] Get LATEST chromedriver version for google-chrome
2025-07-28T04:59:32Z [Information] Driver has been saved in cache [/home/.wdm/drivers/chromedriver/linux64/114.0.5735.90]
2025-07-28T04:59:32Z [Error] Executed 'Functions.extract_signals' (Failed, Id=5e2ad53f-b0ca-4349-b470-c654ab4c8f2c, Duration=700ms)
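Since the host only logs this generic failure, one thing I'm considering is wrapping the Selenium startup in a try/except that logs the full traceback, so Application Insights would show the underlying exception instead of just the 500. A minimal sketch (the helper name _start_driver is mine):

import logging
import traceback

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

def _start_driver(chrome_options):
    # log the real startup error before re-raising, so it shows up
    # in Application Insights instead of a bare 500
    try:
        service = ChromeService(executable_path=ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=chrome_options)
    except Exception:
        logging.error("Driver startup failed:\n%s", traceback.format_exc())
        raise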
As it stands, I can't pinpoint exactly where in function_app.py things go wrong. Here are the contents of my requirements.txt file, which lists the packages my function app needs:
attrs==25.3.0
azure-core==1.35.0
azure-functions==1.23.0
azure-identity==1.23.1
azure-keyvault==4.2.0
azure-keyvault-certificates==4.10.0
azure-keyvault-keys==4.11.0
azure-keyvault-secrets==4.10.0
azure-storage-blob==12.26.0
azure-storage-file-datalake==12.21.0
beautifulsoup4==4.13.4
certifi==2025.7.14
cffi==1.17.1
charset-normalizer==3.4.2
cryptography==45.0.5
h11==0.16.0
idna==3.10
isodate==0.7.2
MarkupSafe==3.0.2
msal==1.32.3
msal-extensions==1.3.1
numpy==2.2.5
outcome==1.3.0.post0
packaging==25.0
pandas==2.2.3
pip==23.2.1
pycparser==2.22
PyJWT==2.10.1
PySocks==1.7.1
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
requests==2.32.3
selenium==4.32.0
setuptools==65.5.0
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.7
trio==0.30.0
trio-websocket==0.12.2
typing_extensions==4.14.1
tzdata==2025.2
urllib3==2.5.0
webdriver-manager==4.0.2
websocket-client==1.8.0
Werkzeug==3.1.3
wsproto==1.2.0
Here is the list of files my function app has when it's deployed:
And here is the full code of my function_app.py, containing the functions http_trigger and extract_signals:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from azure.identity import DefaultAzureCredential, ClientSecretCredential
from azure.storage.blob import (
    BlobServiceClient,
    ContainerClient,
    BlobClient,
    BlobSasPermissions,
    ContainerSasPermissions,
    AccountSasPermissions,
    Services,
    ResourceTypes,
    UserDelegationKey,
    generate_account_sas,
    generate_container_sas,
    generate_blob_sas,
)
from azure.keyvault.secrets import SecretClient
from datetime import datetime, timedelta
from argparse import ArgumentParser
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv
from pathlib import Path
from urllib.parse import urlencode
import time
import requests
import os
import azure.functions as func
import logging
import json
# # this is strictly used only in development
# # load env variables
# env_dir = Path('../').resolve()
# load_dotenv(os.path.join(env_dir, '.env'))
app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
def batch_signal_files_lookup(data: list, batch_size: int):
    """
    returns an iterator of JSON objects, each representing
    the base URLs and the relative URLs of the compressed
    audio recordings/signals of subjects in an http resource
    """
    for i in range(0, len(data), batch_size):
        # get current batch of data
        curr_batch = data[i:i + batch_size]
        # construct the lookup dictionary that will be
        # uploaded as json to adls2
        curr_batch_signal_files_lookup = [
            {
                "BaseURL": download_link,
                "RelativeURL": download_link.split("/")[-1],
                "FileName": download_link.split("/")[-1],
            } for download_link in curr_batch
        ]
        # convert dictionary to json
        curr_batch_signal_files_lookup_json = json.dumps(
            curr_batch_signal_files_lookup,
            indent=4
        ).encode("utf-8")
        # yield the json object
        yield curr_batch_signal_files_lookup_json
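# note: each payload yielded above is a UTF-8 encoded JSON array shaped like
# [{"BaseURL": "...", "RelativeURL": "...", "FileName": "..."}, ...],
# consumed later in extract_signals and uploaded as signal_files_lookup_<n>.json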
@app.route(route="http_trigger")
def http_trigger(req: func.HttpRequest) -> func.HttpResponse:
logging.info('Python HTTP trigger function processed a request.')
# we need to set the environment variables in our function
# app so that when DefaultAzureCredentials() runs it loads
# the env variables in our azure function app service
# as credentials. This must include the name of the secret
# we want to access set to a value @Microsoft.KeyVault(SecretUri=<copied-value>)
# where `<copied value>` here is actually the secred identifier we
# copied when we created our secret key and value pair azure key
# vault. If this is not set even if azure key vault hgas an access
# policy that grants the azure function to access it it will result
# in a internal server 500 error
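    # e.g. (illustrative only; mirroring the secret named 'test' below) such an
    # app setting using a Key Vault reference would look like:
    #   test = @Microsoft.KeyVault(SecretUri=https://sgppipelinekv.vault.azure.net/secrets/test/)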
    credential = DefaultAzureCredential()

    secret_client = SecretClient(
        vault_url="https://sgppipelinekv.vault.azure.net",
        credential=credential
    )
    test = secret_client.get_secret('test')
    storage_account_name = secret_client.get_secret("StorageAccountName")

    # create client with our managed identity credential
    blob_service_client = BlobServiceClient(
        account_url=f"https://{storage_account_name.value}.blob.core.windows.net",
        credential=credential
    )

    # retrieves container client to retrieve blob client
    # writes json file to the selected container
    misc_container_client = blob_service_client.get_container_client(f"{storage_account_name.value}-miscellaneous")

    # create test dictionary to convert to json object
    test = [
        {
            "BaseURL": "http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/1028-20100710-hne.tgz",
            "RelativeURL": "1028-20100710-hne.tgz",
            "FileName": "1028-20100710-hne.tgz"
        },
        {
            "BaseURL": "http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/1337ad-20170321-ajg.tgz",
            "RelativeURL": "1337ad-20170321-ajg.tgz",
            "FileName": "1337ad-20170321-ajg.tgz"
        }
    ]
    test_json = json.dumps(test, indent=4).encode("utf-8")

    # create file in blob container and upload the json object
    test_client = misc_container_client.get_blob_client("signal_files_lookup_test.json")
    test_client.upload_blob(test_json, overwrite=True)

    # # listing containers and blobs
    # for container in blob_service_client.list_containers():
    #     for blob in blob_service_client.get_container_client(container.name).list_blobs():
    #         print(blob.name)

    # if there is a passed parameter get its value
    name = req.params.get('name')
    if not name:
        try:
            req_body = req.get_json()
        except ValueError:
            pass
        else:
            name = req_body.get('name')

    if name:
        return func.HttpResponse(f"Hello {name}, your HTTP triggered function wrote test.json to storage container {storage_account_name.value}.")
    else:
        return func.HttpResponse(
            f"Hello, your HTTP triggered function wrote test.json to storage container {storage_account_name.value}.",
            status_code=200
        )
@app.route(route="extract_signals")
def extract_signals(req: func.HttpRequest) -> func.HttpResponse:
# define chrome options
chrome_options = ChromeOptions()
chrome_options.add_experimental_option('detach', True)
# arguments
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
service = ChromeService(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get('http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/')
# wait 5 seconds for page load
time.sleep(5)
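    # (alternative sketch) instead of a fixed sleep, the already-imported
    # explicit wait could be used to wait until anchor tags are present:
    # WebDriverWait(driver, 10).until(
    #     EC.presence_of_element_located((By.TAG_NAME, "a"))
    # )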
    # scrolls down to very bottom
    driver.execute_script("window.scrollBy(0, document.body.scrollHeight)")
    # extracts all anchor tags in page
    anchor_tags = driver.find_elements(By.TAG_NAME, "a")

    def helper(a_tag):
        # this will extract the href of each anchor tag
        link = a_tag.get_attribute('href')
        return link

    # concurrently read and load all .tgz files
    with ThreadPoolExecutor() as exe:
        links = list(exe.map(helper, anchor_tags))

    # exclude all hrefs without the .tgz extension, e.g.
    # http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/1028-20100710-hne.tgz
    # (get_attribute may return None, so guard against that first)
    download_links = list(filter(lambda link: link and link.endswith('.tgz'), links))
    batched_signal_files_lookup_jsons = batch_signal_files_lookup(download_links, batch_size=5000)

    # get number of downloads
    n_links = len(download_links)

    # check if a parameter has been entered in the URL
    RANGE = req.params.get('range')
    if not RANGE:
        try:
            req_body = req.get_json()
        except ValueError:
            pass
        else:
            RANGE = req_body.get('range')
    RANGE = n_links if not RANGE else int(RANGE)

    # once deployed to the azure function app environment,
    # DefaultAzureCredential() retrieves the azure function's
    # managed identity, which we created when we enabled system
    # assigned managed identity. This assigns an object id
    # to our azure function app, and we permitted this
    # object id in our azure key vault and azure blob storage
    # to have access to these resources. This object id is what
    # we use to access these resources
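    # e.g. (sketch; ids/scopes are placeholders) the grants described above
    # would have been made roughly like:
    #   az keyvault set-policy --name sgppipelinekv \
    #       --object-id <function-app-principal-id> --secret-permissions get list
    #   az role assignment create --assignee <function-app-principal-id> \
    #       --role "Storage Blob Data Contributor" --scope <storage-account-resource-id>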
    credential = DefaultAzureCredential()
    # pass this as credential to the secret client as well
    # as our blob storage client later
    secret_client = SecretClient(
        vault_url="https://sgppipelinekv.vault.azure.net",
        credential=credential
    )

    # load secret keys from key vault
    test = secret_client.get_secret('test')
    storage_account_name = secret_client.get_secret('storage-account-name')
    # resource_group_name = secret_client.get_secret('resource-group-name')

    # # Retrieve credentials from environment variables
    # storage_account_name = os.environ.get("STORAGE_ACCOUNT_NAME")
    # storage_account_key = os.environ.get("STORAGE_ACCOUNT_KEY")
    # account_sas_kwargs = {
    #     "account_name": storage_account_name,
    #     "account_key": storage_account_key,
    #     "services": Services(blob=True, queue=True, fileshare=True),
    #     "resource_types": ResourceTypes(
    #         service=True,
    #         container=True,
    #         object=True
    #     ),
    #     "permission": AccountSasPermissions(
    #         read=True,
    #         write=True,
    #         delete=True,
    #         list=True,
    #         add=True,
    #         create=True,
    #         update=True,
    #         process=True
    #     ),
    #     "start": datetime.utcnow(),
    #     "expiry": datetime.utcnow() + timedelta(days=1)
    # }
    # # the generated sas token is at the level of the storage account,
    # # permitting services like blobs, files, queues, and tables
    # # to be read, listed, retrieved, updated, deleted, etc.,
    # # where the allowed resource types are service and container
    # sas_token = generate_account_sas(**account_sas_kwargs)

    # begin writing files in blob storage
    try:
        # create client with our managed identity credential
        blob_service_client = BlobServiceClient(
            account_url=f"https://{storage_account_name.value}.blob.core.windows.net",
            credential=credential
        )
        # retrieves container client to retrieve blob client
        misc_container_client = blob_service_client.get_container_client(f"{storage_account_name.value}-miscellaneous")
        # using the newly created blob client we upload the json
        # object as a file. There are 6321 of these urls in total
        for i, batch in enumerate(batched_signal_files_lookup_jsons):
            signal_files_lookup_client = misc_container_client.get_blob_client(f"signal_files_lookup_{i + 1}.json")
            signal_files_lookup_client.upload_blob(batch, overwrite=True)
    except Exception as e:
        # use logging instead of print so the error shows up in app insights
        logging.error(f"Error operating on blobs: {e}")

    return func.HttpResponse(
        f"This HTTP triggered function extracted {n_links} audio signals successfully to storage account {storage_account_name.value}",
        status_code=200
    )
Where could the problem lie here? Is it the chromedriver installation? Do I need to download the binary myself, deploy it alongside my function, and have the function access it locally? Any feedback would help.
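For reference, here is the kind of quick check I could add to extract_signals to confirm whether a Chrome binary even exists in the sandbox before pointing a driver at it (a sketch; the binary names are guesses on my part):

import logging
import shutil

def log_chrome_availability():
    # log which, if any, common Chrome/Chromium binaries are on PATH
    for name in ("google-chrome", "google-chrome-stable", "chromium-browser", "chromium"):
        logging.info("which %s -> %s", name, shutil.which(name))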
