#Imports
import math
import seaborn as sns
import statsmodels.api as sm
import sys
sys.path.append('/home/ubuntu/workplace/python_notebook/cost_model')
import experiment as exp
from config import DatabaseConfig
from database_instance import DatabaseInstance
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
# Connection settings for the local calibration-results MongoDB instance.
database_config = DatabaseConfig(connection_string='mongodb://localhost',
database_name='abt_calibration_equal_interval', dump_path='',
restore_from_dump=False, dump_on_exit=False)
database = DatabaseInstance(database_config)
# Top-level await: this is notebook-exported code, not an importable module.
df = await exp.load_calibration_data(database, 'calibrationDataEqualInterval')
df.describe()
total_execution_time | |
---|---|
count | 4.500000e+03 |
mean | 4.504779e+05 |
std | 5.744749e+05 |
min | 4.152000e+03 |
25% | 1.559245e+05 |
50% | 2.913080e+05 |
75% | 4.054750e+05 |
max | 2.389553e+06 |
# Keep rows within the [0.0, 0.90] quantile range of total_execution_time —
# presumably trimming the slowest 10% as outliers; TODO confirm
# remove_outliers' exact semantics in experiment.py.
noout_df = exp.remove_outliers(df, 0.0, 0.90)
noout_df.describe()
total_execution_time | |
---|---|
count | 4.050000e+03 |
mean | 4.489712e+05 |
std | 5.742093e+05 |
min | 4.152000e+03 |
25% | 1.557142e+05 |
50% | 2.899445e+05 |
75% | 4.019805e+05 |
max | 2.388043e+06 |
# Explode the per-query rows into one row per ABT plan node.
abt_df = exp.extract_abt_nodes(noout_df)
abt_df.count()
abt_type 20250 execution_time 20250 n_returned 20250 n_processed 20250 keys_length_in_bytes 20250 average_document_size_in_bytes 20250 run_id 20250 dtype: int64
abt_df.abt_type.unique()
array(['IndexScan', 'Root', 'BinaryJoin', 'LimitSkip', 'Seek'], dtype=object)
# Restrict to IndexScan nodes only — the node type being calibrated below.
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
idx_scan_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | run_id | |
---|---|---|---|---|---|---|---|
0 | IndexScan | 141 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
5 | IndexScan | 128 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
10 | IndexScan | 122 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
15 | IndexScan | 117 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
20 | IndexScan | 117 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
idx_scan_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | |
---|---|---|---|---|---|
count | 4050.000000 | 4.050000e+03 | 4.050000e+03 | 4050.000000 | 4050.000000 |
mean | 3563.027901 | 2.287778e+05 | 2.287788e+05 | 453.388889 | 959.777778 |
std | 3237.451717 | 3.700726e+05 | 3.700726e+05 | 303.412025 | 297.650137 |
min | 45.000000 | 9.500000e+02 | 9.510000e+02 | 3.000000 | 118.000000 |
25% | 702.250000 | 5.662300e+04 | 5.662400e+04 | 158.000000 | 1065.000000 |
50% | 2861.000000 | 1.132395e+05 | 1.132405e+05 | 438.000000 | 1065.000000 |
75% | 5556.500000 | 1.687260e+05 | 1.687270e+05 | 718.000000 | 1065.000000 |
max | 16649.000000 | 1.500543e+06 | 1.500544e+06 | 998.000000 | 1065.000000 |
idx_scan_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | |
---|---|---|---|---|---|
execution_time | 1.000000 | 0.437456 | 0.437456 | 0.418959 | -0.356969 |
n_returned | 0.437456 | 1.000000 | 1.000000 | -0.429853 | -0.980535 |
n_processed | 0.437456 | 1.000000 | 1.000000 | -0.429853 | -0.980535 |
keys_length_in_bytes | 0.418959 | -0.429853 | -0.429853 | 1.000000 | 0.439227 |
average_document_size_in_bytes | -0.356969 | -0.980535 | -0.980535 | 0.439227 | 1.000000 |
sns.scatterplot(x=idx_scan_df['n_processed'], y=idx_scan_df['execution_time'])
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
# Ordinary least squares: execution_time ~ const + n_processed.
y = idx_scan_df['execution_time']
X = sm.add_constant(idx_scan_df[['n_processed']])
idx_scan_lm = sm.OLS(y, X).fit()
idx_scan_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.191 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.191 |
Method: | Least Squares | F-statistic: | 958.0 |
Date: | Wed, 12 Oct 2022 | Prob (F-statistic): | 5.56e-189 |
Time: | 11:54:53 | Log-Likelihood: | -38050. |
No. Observations: | 4050 | AIC: | 7.610e+04 |
Df Residuals: | 4048 | BIC: | 7.612e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 2687.5063 | 53.790 | 49.963 | 0.000 | 2582.049 | 2792.964 |
n_processed | 0.0038 | 0.000 | 30.951 | 0.000 | 0.004 | 0.004 |
Omnibus: | 895.274 | Durbin-Watson: | 0.131 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1661.744 |
Skew: | 1.381 | Prob(JB): | 0.00 |
Kurtosis: | 4.490 | Cond. No. | 5.11e+05 |
y = idx_scan_df['execution_time']
X = idx_scan_df[['n_processed']]
X = sm.add_constant(X)
# Gaussian GLM with a linear constraint (R, q) = ([1, 0], min): the [1, 0]
# row selects the const column, pinning the intercept to the minimum observed
# execution time (45 in the recorded output), so only the n_processed slope
# is estimated.
idx_scan_glm = sm.GLM(y, X).fit_constrained(([1, 0], idx_scan_df.execution_time.min()))
idx_scan_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 4050 |
---|---|---|---|
Model: | GLM | Df Residuals: | 4049 |
Model Family: | Gaussian | Df Model: | 0 |
Link Function: | identity | Scale: | 1.3528e+07 |
Method: | IRLS | Log-Likelihood: | -38997. |
Date: | Wed, 12 Oct 2022 | Deviance: | 5.4776e+10 |
Time: | 11:55:11 | Pearson chi2: | 5.48e+10 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | -0.2526 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 45.0000 | 0 | inf | 0.000 | 45.000 | 45.000 |
n_processed | 0.0070 | 0.000 | 52.850 | 0.000 | 0.007 | 0.007 |
# Re-extract ABT nodes, restricted to queries over the 'str_choice' field.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice')
abt_df.count()
abt_type 20250 execution_time 20250 n_returned 20250 n_processed 20250 keys_length_in_bytes 20250 average_document_size_in_bytes 20250 run_id 20250 dtype: int64
import pandas as pd
def calibrate(abt_node_df: pd.DataFrame, variables: list[str] | None = None):
    """Calibrate the ABT node given in abt_node_df with the given model input variables.

    Fits a non-negative least-squares linear model of ``execution_time`` on
    ``variables``, prints the R^2 score and coefficients, and plots the data
    with the fitted line overlaid.

    Args:
        abt_node_df: Per-node measurements; must contain 'execution_time',
            'n_processed', and every column named in ``variables``.
        variables: Model input columns; defaults to ['n_processed'].

    Returns:
        The fitted sklearn ``LinearRegression`` model (previously None;
        callers that ignore the return value are unaffected).
    """
    if variables is None:
        variables = ['n_processed']
    y = abt_node_df['execution_time']
    # add_constant + fit_intercept=False: the intercept is modeled as the
    # coefficient of the explicit constant column, so positive=True also
    # constrains the intercept to be non-negative.
    X = sm.add_constant(abt_node_df[variables])
    nnls = LinearRegression(positive=True, fit_intercept=False)
    model = nnls.fit(X, y)
    y_pred = model.predict(X)
    print(f'R2: {r2_score(y, y_pred)}')
    print(f'Coefficients: {model.coef_}')
    sns.scatterplot(x=abt_node_df['n_processed'], y=abt_node_df['execution_time'])
    sns.lineplot(x=abt_node_df['n_processed'], y=y_pred, color='red')
    return model
calibrate(idx_scan_df, ['n_processed', 'keys_length_in_bytes'])
R2: 0.5987985329783202 Coefficients: [0.00000000e+00 5.37292461e-03 5.80927078e+00]
# Re-run the 'str_choice' extraction (repeat of the earlier cell).
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice')
abt_df.count()
abt_type 20250 execution_time 20250 n_returned 20250 n_processed 20250 keys_length_in_bytes 20250 average_document_size_in_bytes 20250 run_id 20250 dtype: int64
abt_df.abt_type.unique()
array(['IndexScan', 'Root', 'BinaryJoin', 'LimitSkip', 'Seek'], dtype=object)
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed']) # cost: 0.0026 — NOTE(review): the recorded coefficient below is ~0.0038; confirm which run this figure refers to
R2: 0.19136805765412557 Coefficients: [2.68750630e+03 3.82693539e-03]
# NOTE(review): the recorded count output is an empty Series — 'str_uniform'
# apparently matched no nodes, so the calibration results below may be stale
# output from an earlier run.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_uniform')
abt_df.count()
Series([], dtype: int64)
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.3794698927458856 Coefficients: [5.7848001 0.0433854]
# Calibrate against queries over the 'int_normal1' field.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'int_normal1')
abt_df.count()
abt_type 14846 execution_time 14846 n_returned 14846 n_processed 14846 keys_length_in_bytes 14846 average_document_size_in_bytes 14846 run_id 14846 dtype: int64
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.2893741110299918 Coefficients: [2.28274307e+02 2.91162622e-02]
# Calibrate against queries over the 'str_choice_200_' field.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice_200_')
abt_df.count()
abt_type 18000 execution_time 18000 n_returned 18000 n_processed 18000 keys_length_in_bytes 18000 average_document_size_in_bytes 18000 run_id 18000 dtype: int64
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.2816592524864636 Coefficients: [2.41369410e+02 2.90449573e-02]
calibrate(idx_scan_df, ['n_processed', 'keys_length_in_bytes'])
R2: 0.590026568940176 Coefficients: [0. 0.01354941 4.61593592]
# Calibrate against queries over the 'str_choice_50_' field.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice_50_')
abt_df.count()
abt_type 2250 execution_time 2250 n_returned 2250 n_processed 2250 keys_length_in_bytes 2250 average_document_size_in_bytes 2250 run_id 2250 dtype: int64
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.28494934654418846 Coefficients: [0. 0.00547649]
# Repeat of the 'str_choice_200_' extraction above.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice_200_')
abt_df.count()
abt_type 18000 execution_time 18000 n_returned 18000 n_processed 18000 keys_length_in_bytes 18000 average_document_size_in_bytes 18000 run_id 18000 dtype: int64
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
# NOTE(review): the recorded output below is identical to the 'int_normal1'
# run — likely stale output from out-of-order cell execution; re-run to confirm.
calibrate(idx_scan_df, ['n_processed'])
R2: 0.2893741110299918 Coefficients: [2.28274307e+02 2.91162622e-02]
# Repeat the workflow on the smaller equal-interval calibration dataset.
df = await exp.load_calibration_data(database, 'calibrationDataEqualIntervalSmall')
noout_df = exp.remove_outliers(df, 0.0, 0.90)
noout_df.describe()
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice_20_1')
#abt_df.count()
noout_df.describe()
total_execution_time | |
---|---|
count | 84.000000 |
mean | 77.285714 |
std | 29.445711 |
min | 33.000000 |
25% | 51.500000 |
50% | 75.500000 |
75% | 106.000000 |
max | 122.000000 |
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.14011342736457788 Coefficients: [15.38671068 0.02208039]