0

I am trying to optimally split Python code on FaaS to improve response time.

To split the code at the optimal location, I need two measurements: the execution time of each line, and the size of the data that each line depends on. Is there an appropriate way to obtain these two measurements?

The environment I am using is,

  • Azure Functions
  • Vscode
  • Python programming model v2

By the way, it may not be necessary to split them, but I am going to try to split the activity function in the code below into multiple functions.

import azure.functions as func
import azure.durable_functions as df
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing  # Dataset
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import Lasso  
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error  # MSE(Mean Squared Error)
from sklearn.preprocessing import StandardScaler 

# Durable Functions app; anonymous HTTP auth means no function key is required.
app = df.DFApp(http_auth_level=func.AuthLevel.ANONYMOUS)
### client function ###
@app.route(route="orchestrators/client_function")
@app.durable_client_input(client_name="client")
async def client_function(req: func.HttpRequest, client: df.DurableOrchestrationClient) -> func.HttpResponse:
    """HTTP starter: launch the orchestration and wait for it to finish.

    Returns the completion response (carrying the orchestration output) when
    the instance finishes within the default timeout, otherwise the 202
    check-status response that the helper produces on timeout.
    """
    instance_id = await client.start_new("orchestrator", None, {})
    # Bug fix: the original awaited this call but discarded its response and
    # returned a fresh check-status response instead, so the caller never saw
    # the orchestration result despite having waited for it.
    return await client.wait_for_completion_or_create_check_status_response(req, instance_id)

### orchestrator function ###
@app.orchestration_trigger(context_name="context")
def orchestrator(context: df.DurableOrchestrationContext) -> str:
    """Run the single 'origin_analysis' activity, then report completion."""
    # The activity's return value is not needed here, so it is not kept.
    yield context.call_activity("origin_analysis", '')
    return "finished"


### activity function ###
@app.blob_output(arg_name="outputblob", path="newblob/test.txt", connection="BlobStorageConnection")
@app.activity_trigger(input_name="blank")
def origin_analysis(blank: str, outputblob: func.Out[str]):
    """Fit plain, Ridge and Lasso regressions on the California housing data.

    Args:
        blank: Unused activity input (the orchestrator passes '').
        outputblob: Blob output binding; receives the comparison table.

    Returns:
        str: String rendering of the train/test MSE and R^2 comparison table.
    """
    # prepare data
    california_housing = fetch_california_housing()

    exp_data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
    tar_data = pd.DataFrame(california_housing.target, columns=['HousingPrices'])
    data = pd.concat([exp_data, tar_data], axis=1)

    # Delete anomalous values
    data = data[data['HouseAge'] != 52]
    data = data[data['HousingPrices'] != 5.00001]

    # Create useful variables
    data['Household'] = data['Population']/data['AveOccup']
    data['AllRooms'] = data['AveRooms']*data['Household']
    data['AllBedrms'] = data['AveBedrms']*data['Household']

    ### simple regression analysis ###
    exp_var = 'MedInc'
    tar_var = 'HousingPrices'

    # Remove outliers: drop rows at or above the 95th percentile of MedInc.
    # (The original applied this identical filter twice; the second pass was
    # a no-op and has been removed.)
    q_95 = data['MedInc'].quantile(0.95)
    data = data[data['MedInc'] < q_95]

    # Split data into explanatory and objective variables
    X = data[[exp_var]]
    y = data[[tar_var]]

    # learn (simple regression; model is refit below for the multiple case)
    model = LinearRegression()
    model.fit(X, y)

    ### multiple regression analysis ###
    exp_vars = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
    tar_var = 'HousingPrices'

    # Remove outliers: sequential 95th-percentile cut, one feature at a time
    for exp_var in exp_vars:
        q_95 = data[exp_var].quantile(0.95)
        data = data[data[exp_var] < q_95]

    # Split data into explanatory and objective variables
    X = data[exp_vars]
    y = data[[tar_var]]

    # Split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Standardize with statistics computed on the training split only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=exp_vars)

    # learn (plain multiple regression)
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Predicted values on the training data (the dead "y_pred[:10]"
    # notebook-inspection line from the original has been removed)
    y_pred = model.predict(X_train_scaled)

    # MSE for test data
    X_test_scaled = scaler.transform(X_test) # Test data standardized by mean and standard deviation obtained from training data
    y_test_pred = model.predict(X_test_scaled) # Predicting against test data
    mse_test = mean_squared_error(y_test, y_test_pred)

    # Ridge regression
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train_scaled, y_train)
    ridge_y_pred = ridge.predict(X_train_scaled)

    # Checking Partial Regression Coefficients
    ridge_w = pd.DataFrame(ridge.coef_.T, index=exp_vars, columns=['Ridge'])
    for xi, wi in zip(exp_vars, ridge.coef_[0]):
        print('{0:7s}: {1:6.3f}'.format(xi, wi))

    # Mean Squared Error (MSE) for training data
    mse_train = mean_squared_error(y_train, y_pred)
    ridge_mse_train = mean_squared_error(y_train, ridge_y_pred)

    # MSE for test data
    ridge_y_test_pred = ridge.predict(X_test_scaled) # Predicting against test data
    ridge_mse_test = mean_squared_error(y_test, ridge_y_test_pred)

    # Lasso regression
    lasso = Lasso(alpha=1.0)
    lasso.fit(X_train_scaled, y_train)
    lasso_y_pred = lasso.predict(X_train_scaled)

    # Checking Partial Regression Coefficients
    lasso_w = pd.Series(index=exp_vars, data=lasso.coef_)

    lasso_mse_train = mean_squared_error(y_train, lasso_y_pred)

    # Reuse X_test_scaled: re-running scaler.transform(X_test) produced the
    # identical array in the original.
    lasso_y_pred_test = lasso.predict(X_test_scaled)
    lasso_mse_test = mean_squared_error(y_test, lasso_y_pred_test)

    # Comparison of the accuracy of multiple regression analysis with and
    # without regularization (renamed from 'data' to stop shadowing the
    # DataFrame of the same name above).
    summary = {'Training data MSE':[mse_train, ridge_mse_train, lasso_mse_train],
               'Test Data MSE':[mse_test, ridge_mse_test, lasso_mse_test],
               'coefficient of determination':[model.score(X_test_scaled, y_test), ridge.score(X_test_scaled, y_test), lasso.score(X_test_scaled, y_test)]}
    df_mse = pd.DataFrame(data=summary, index=['multiple regression', 'Ridge regression', 'Lasso regression'])

    result = str(df_mse)
    # Bug fix: the blob output binding was declared on the decorator but the
    # function never wrote to it; persist the comparison table to the blob.
    outputblob.set(result)
    return result
3
  • For the execution time, I suggest that you could use Jupyter-notebook and create a separate cell for each line of code. Commented Nov 9, 2023 at 5:44
  • Is it impossible for Google colab to recognize execution time in ms? Commented Nov 9, 2023 at 6:28
  • pretty much a dup of stackoverflow.com/q/74049834/11107541 Commented Nov 9, 2023 at 20:06

1 Answer 1

1

You can use the `time` module to check the execution time of the code blocks in your code. I have imported the `time` module and made the changes to your code shown below:-

My function_app.py

import azure.functions as func
import azure.durable_functions as df
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing  # Dataset
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import Lasso  
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error  # MSE(Mean Squared Error)
from sklearn.preprocessing import StandardScaler 
import sys
import cProfile
import time

# Durable Functions app; anonymous HTTP auth means no function key is required.
app = df.DFApp(http_auth_level=func.AuthLevel.ANONYMOUS)
### client function ###

@app.route(route="orchestrators/client_function")
@app.durable_client_input(client_name="client")
async def client_function(req: func.HttpRequest, client: df.DurableOrchestrationClient) -> func.HttpResponse:
    """HTTP starter: launch the orchestration, wait up to the default timeout,
    then hand back a check-status response with the management URLs."""
    new_instance = await client.start_new("orchestrator", None, {})
    # Block until completion or timeout; the helper's response is not used.
    await client.wait_for_completion_or_create_check_status_response(req, new_instance)
    return client.create_check_status_response(req, new_instance)
   
### orchestrator function ###
@app.orchestration_trigger(context_name="context")
def orchestrator(context: df.DurableOrchestrationContext) -> str:
    """Invoke the 'origin_analysis' activity once, then signal completion."""
    activity_output = yield context.call_activity("origin_analysis", '')
    return "finished"


### activity function ###
@app.blob_output(arg_name="outputblob", path="newblob/test.txt", connection="BlobStorageConnection")
@app.activity_trigger(input_name="blank")
def origin_analysis(blank: str, outputblob: func.Out[str]):
    """Run the whole regression analysis and time it end-to-end.

    Fits plain, Ridge and Lasso regressions on the California housing data,
    prints the total wall-clock time (time.time() delta) and the size of the
    final 'data' dict, and returns the MSE comparison table as a string.

    Args:
        blank: Unused activity input (the orchestrator passes '').
        outputblob: Blob output binding. NOTE(review): declared but never
            written to in this function — confirm whether a .set() call
            was intended.
    """
    start_time = time.time()  # start of the end-to-end timing window
    # prepare data
    california_housing = fetch_california_housing()

    exp_data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
    tar_data = pd.DataFrame(california_housing.target, columns=['HousingPrices'])
    data = pd.concat([exp_data, tar_data], axis=1)

    # Delete anomalous values
    data = data[data['HouseAge'] != 52]
    data = data[data['HousingPrices'] != 5.00001]

    # Create useful variables
    data['Household'] = data['Population']/data['AveOccup']
    data['AllRooms'] = data['AveRooms']*data['Household']
    data['AllBedrms'] = data['AveBedrms']*data['Household']


    ### simple regression analysis ###
    exp_var = 'MedInc'
    tar_var = 'HousingPrices'

    # Remove outliers (rows at or above the 95th percentile of MedInc)
    q_95 = data['MedInc'].quantile(0.95)

    data = data[data['MedInc'] < q_95]

    # NOTE(review): identical filter repeated — this second pass is a no-op.
    data = data[data['MedInc'] < q_95]

    # Split data into explanatory and objective variables
    X = data[[exp_var]]
    y = data[[tar_var]]

    # learn
    model = LinearRegression()
    model.fit(X, y)

    ### multiple regression analysis ###
    exp_vars = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
    tar_var = 'HousingPrices'

    # Remove outliers (sequential 95th-percentile cut per feature)
    for exp_var in exp_vars:
        q_95 = data[exp_var].quantile(0.95)
        data = data[data[exp_var] < q_95]

    # Split data into explanatory and objective variables
    X = data[exp_vars]
    y = data[[tar_var]]

    # Split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    #  Standardize X_train (scaler is fit on the training split only)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_train_scaled = pd.DataFrame(X_train_scaled, columns = exp_vars)

    # learn
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Calculate predicted values
    y_pred = model.predict(X_train_scaled)
    # NOTE(review): the slice below is computed and discarded — a leftover
    # notebook-style inspection line with no effect.
    y_pred[:10]

    # MSE for test data
    X_test_scaled = scaler.transform(X_test) # Test data standardized by mean and standard deviation obtained from training data
    y_test_pred = model.predict(X_test_scaled) # Predicting against test data
    mse_test = mean_squared_error(y_test, y_test_pred)

    # Ridge regression
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train_scaled, y_train)
    ridge_y_pred = ridge.predict(X_train_scaled)

    # Checking Partial Regression Coefficients (ridge_w itself is unused)
    ridge_w = pd.DataFrame(ridge.coef_.T, index=exp_vars, columns=['Ridge'])
    for xi, wi in zip(exp_vars, ridge.coef_[0]):
        print('{0:7s}: {1:6.3f}'.format(xi, wi))

    # Mean Squared Error (MSE) for training data
    mse_train = mean_squared_error(y_train, y_pred)

    # Mean Squared Error (MSE) for training data (Ridge)
    ridge_mse_train = mean_squared_error(y_train, ridge_y_pred)

    # MSE for test data
    ridge_y_test_pred = ridge.predict(X_test_scaled) # Predicting against test data
    ridge_mse_test = mean_squared_error(y_test, ridge_y_test_pred)

    # Lasso regression
    lasso = Lasso(alpha=1.0)
    lasso.fit(X_train_scaled, y_train)
    lasso_y_pred = lasso.predict(X_train_scaled)

    # Checking Partial Regression Coefficients (lasso_w itself is unused)
    lasso_w = pd.Series(index=exp_vars, data=lasso.coef_)

    lasso_mse_train = mean_squared_error(y_train, lasso_y_pred)

    # NOTE(review): scaler.transform(X_test) recomputes the same array as
    # X_test_scaled above.
    lasso_X_test_scaled = scaler.transform(X_test)
    lasso_y_pred_test = lasso.predict(lasso_X_test_scaled)
    lasso_mse_test = mean_squared_error(y_test, lasso_y_pred_test)

    # Comparison of the accuracy of multiple regression analysis with and without regularization
    # NOTE(review): 'data' here rebinds the name previously holding the DataFrame.
    data = {'Training data MSE':[mse_train, ridge_mse_train, lasso_mse_train],
            'Test Data MSE':[mse_test, ridge_mse_test, lasso_mse_test],
            'coefficient of determination':[model.score(X_test_scaled, y_test), ridge.score(X_test_scaled, y_test), lasso.score(X_test_scaled, y_test)]}
    df_mse = pd.DataFrame(data=data, index=['multiple regression', 'Ridge regression', 'Lasso regression'])

    end_time = time.time()
    execution_time = end_time - start_time
    # Measure the size of 'data' here
    # NOTE(review): sys.getsizeof is shallow — it reports the dict object
    # itself, not the lists/floats it contains.
    data_size = sys.getsizeof(data)
    print(f"Execution time: {execution_time} seconds")
    print(f"Size of 'data': {data_size} bytes")
    return str(df_mse)
    # Unreachable after return; kept only as a hint toward cProfile usage.
    # test = cProfile.run(origin_analysis)
    # print(test)

Output:-

enter image description here

enter image description here

You can also make use of cProfile to get the execution time of each code block. Here's the function_app.py code utilizing cProfile:-

function_app.py:-

import azure.functions as func
import azure.durable_functions as df
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing  # Dataset
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import Lasso  
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error  # MSE(Mean Squared Error)
from sklearn.preprocessing import StandardScaler 
import sys
import cProfile
import time

# Durable Functions app; anonymous HTTP auth means no function key is required.
app = df.DFApp(http_auth_level=func.AuthLevel.ANONYMOUS)
### client function ###

@app.route(route="orchestrators/client_function")
@app.durable_client_input(client_name="client")
async def client_function(req: func.HttpRequest, client: df.DurableOrchestrationClient) -> func.HttpResponse:
    """HTTP starter: launch the orchestration, wait for completion (or the
    default timeout), then return a check-status response."""
    instance_id = await client.start_new("orchestrator", None, {})
    # NOTE(review): the awaited response is discarded; the orchestration
    # output is never returned to the caller — confirm this is intentional.
    await client.wait_for_completion_or_create_check_status_response(req, instance_id)
    return client.create_check_status_response(req, instance_id)
   
### orchestrator function ###
@app.orchestration_trigger(context_name="context")
def orchestrator(context: df.DurableOrchestrationContext) -> str:
    """Run the 'origin_analysis' activity once, then finish."""
    # 'result' holds the activity's return value but is not used afterwards.
    result = yield context.call_activity("origin_analysis", '')
    return "finished"


### activity function ###
@app.blob_output(arg_name="outputblob", path="newblob/test.txt", connection="BlobStorageConnection")
@app.activity_trigger(input_name="blank")
def origin_analysis(blank: str, outputblob: func.Out[str]):
    """Regression analysis with cProfile instrumentation.

    Same pipeline as the time.time() variant, but additionally wraps the
    final comparison-table construction in a cProfile.Profile() session and
    prints its stats sorted by cumulative time.

    NOTE(review): the profiler is enabled only around the summary-dict /
    DataFrame block near the end, so none of the sklearn fitting work above
    appears in the printed profile. The 'outputblob' binding is declared but
    never written to.
    """
    start_time = time.time()  # start of the end-to-end timing window
    # prepare data
    california_housing = fetch_california_housing()


    exp_data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
    tar_data = pd.DataFrame(california_housing.target, columns=['HousingPrices'])
    data = pd.concat([exp_data, tar_data], axis=1)

    # Delete anomalous values
    data = data[data['HouseAge'] != 52]
    data = data[data['HousingPrices'] != 5.00001]

    # Create useful variables
    data['Household'] = data['Population']/data['AveOccup']
    data['AllRooms'] = data['AveRooms']*data['Household']
    data['AllBedrms'] = data['AveBedrms']*data['Household']


    ### simple regression analysis ###
    exp_var = 'MedInc'
    tar_var = 'HousingPrices'

    # Remove outliers (rows at or above the 95th percentile of MedInc)
    q_95 = data['MedInc'].quantile(0.95)

    data = data[data['MedInc'] < q_95]

    # NOTE(review): identical filter repeated — this second pass is a no-op.
    data = data[data['MedInc'] < q_95]

    # Split data into explanatory and objective variables
    X = data[[exp_var]]
    y = data[[tar_var]]

    # learn
    model = LinearRegression()
    model.fit(X, y)

    ### multiple regression analysis ###
    exp_vars = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
    tar_var = 'HousingPrices'

    # Remove outliers (sequential 95th-percentile cut per feature)
    for exp_var in exp_vars:
        q_95 = data[exp_var].quantile(0.95)
        data = data[data[exp_var] < q_95]

    # Split data into explanatory and objective variables
    X = data[exp_vars]
    y = data[[tar_var]]

    # Split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    #  Standardize X_train (scaler is fit on the training split only)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_train_scaled = pd.DataFrame(X_train_scaled, columns = exp_vars)

    # learn
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Calculate predicted values
    y_pred = model.predict(X_train_scaled)
    # NOTE(review): the slice below is computed and discarded — no effect.
    y_pred[:10]

    # MSE for test data
    X_test_scaled = scaler.transform(X_test) # Test data standardized by mean and standard deviation obtained from training data
    y_test_pred = model.predict(X_test_scaled) # Predicting against test data
    mse_test = mean_squared_error(y_test, y_test_pred)

    # Ridge regression
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train_scaled, y_train)
    ridge_y_pred = ridge.predict(X_train_scaled)

    # Checking Partial Regression Coefficients (ridge_w itself is unused)
    ridge_w = pd.DataFrame(ridge.coef_.T, index=exp_vars, columns=['Ridge'])
    for xi, wi in zip(exp_vars, ridge.coef_[0]):
        print('{0:7s}: {1:6.3f}'.format(xi, wi))

    # Mean Squared Error (MSE) for training data
    mse_train = mean_squared_error(y_train, y_pred)

    # Mean Squared Error (MSE) for training data (Ridge)
    ridge_mse_train = mean_squared_error(y_train, ridge_y_pred)

    # MSE for test data
    ridge_y_test_pred = ridge.predict(X_test_scaled) # Predicting against test data
    ridge_mse_test = mean_squared_error(y_test, ridge_y_test_pred)

    # Lasso regression
    lasso = Lasso(alpha=1.0)
    lasso.fit(X_train_scaled, y_train)
    lasso_y_pred = lasso.predict(X_train_scaled)

    # Checking Partial Regression Coefficients (lasso_w itself is unused)
    lasso_w = pd.Series(index=exp_vars, data=lasso.coef_)

    lasso_mse_train = mean_squared_error(y_train, lasso_y_pred)

    # NOTE(review): scaler.transform(X_test) recomputes the same array as
    # X_test_scaled above.
    lasso_X_test_scaled = scaler.transform(X_test)
    lasso_y_pred_test = lasso.predict(lasso_X_test_scaled)
    lasso_mse_test = mean_squared_error(y_test, lasso_y_pred_test)
    # Profiling begins here: only the summary construction below is measured.
    profiler = cProfile.Profile()
    profiler.enable()

    # Comparison of the accuracy of multiple regression analysis with and without regularization
    # NOTE(review): 'data' here rebinds the name previously holding the DataFrame.
    data = {'Training data MSE':[mse_train, ridge_mse_train, lasso_mse_train],
            'Test Data MSE':[mse_test, ridge_mse_test, lasso_mse_test],
            'coefficient of determination':[model.score(X_test_scaled, y_test), ridge.score(X_test_scaled, y_test), lasso.score(X_test_scaled, y_test)]}
    df_mse = pd.DataFrame(data=data, index=['multiple regression', 'Ridge regression', 'Lasso regression'])
    profiler.disable()
    profiler.print_stats(sort='cumtime')
    end_time = time.time()
    execution_time = end_time - start_time
    # Measure the size of 'data' here
    # NOTE(review): sys.getsizeof is shallow — it reports the dict object
    # itself, not the lists/floats it contains.
    data_size = sys.getsizeof(data)
    print(f"Execution time: {execution_time} seconds")
    print(f"Size of 'data': {data_size} bytes")
    return str(df_mse)

Output:-

enter image description here

enter image description here

Sign up to request clarification or add additional context in comments.

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.