import math
import bson.json_util as json
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import sys
sys.path.append('/home/ubuntu/mongo/buildscripts/cost_model')
import experiment as exp
from config import DatabaseConfig
from database_instance import DatabaseInstance
database_config = DatabaseConfig(connection_string='mongodb://localhost',
database_name='abt_calibration_big', dump_path='',
restore_from_dump=False, dump_on_exit=False)
database = DatabaseInstance(database_config)
df = await exp.load_calibration_data(database, 'calibrationData')
df.describe()
total_execution_time | |
---|---|
count | 1.564000e+04 |
mean | 6.617052e+05 |
std | 1.058255e+06 |
min | 1.000000e+01 |
25% | 7.275500e+03 |
50% | 6.583500e+04 |
75% | 1.059969e+06 |
max | 7.024473e+06 |
noout_df = exp.remove_outliers(df, 0.0, 0.90)
noout_df.describe()
total_execution_time | |
---|---|
count | 1.411400e+04 |
mean | 6.467882e+05 |
std | 1.036968e+06 |
min | 1.000000e+01 |
25% | 6.824250e+03 |
50% | 6.364950e+04 |
75% | 1.021397e+06 |
max | 7.015423e+06 |
abt_df = exp.extract_abt_nodes(noout_df)
abt_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
0 | LimitSkip | 2209 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
1 | BinaryJoin | 58384 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
2 | IndexScan | 157 | 59685 | 59686 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
3 | Seek | 2906 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
4 | Root | 0 | 59685 | 0 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
abt_df.abt_type.unique()
array(['LimitSkip', 'BinaryJoin', 'IndexScan', 'Seek', 'Root', 'PhysicalScan', 'Filter', 'MergeJoin', 'Union', 'HashJoin', 'GroupBy', 'Evaluation', 'Unwind'], dtype=object)
merge_join_df = abt_df[abt_df.abt_type == 'MergeJoin']
merge_join_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
18365 | MergeJoin | 633 | 0 | 1252 | 1 | 110 | 0 | 6336451e7919d20f9deafd8e | 5440 |
18390 | MergeJoin | 617 | 0 | 1252 | 1 | 110 | 0 | 6336451e7919d20f9deafd8e | 5444 |
18397 | MergeJoin | 616 | 0 | 1252 | 1 | 110 | 0 | 6336451e7919d20f9deafd8e | 5445 |
18410 | MergeJoin | 629 | 0 | 1252 | 1 | 110 | 0 | 6336451e7919d20f9deafd8e | 5448 |
18417 | MergeJoin | 623 | 0 | 1252 | 1 | 110 | 0 | 6336451e7919d20f9deafd8e | 5449 |
exp.print_trees(noout_df, merge_join_df)
SBE stage: nlj, plaNodeId: 7, totalExecutionTime: 647, nReturned: 0, nProcessed: 0 | stage: mj, plaNodeId: 4, totalExecutionTime: 647, nReturned: 0, nProcessed: 1252 | | stage: ixseek, plaNodeId: 0, totalExecutionTime: 11, nReturned: 612, nProcessed: 612 | | stage: ixseek, plaNodeId: 1, totalExecutionTime: 3, nReturned: 640, nProcessed: 641 | stage: limitskip, plaNodeId: 6, totalExecutionTime: 0, nReturned: 0, nProcessed: 0 | | stage: seek, plaNodeId: 5, totalExecutionTime: 0, nReturned: 0, nProcessed: 0 ABT nodeType: Root, plaNodeId: 8 | nodeType: BinaryJoin, plaNodeId: 7 | | nodeType: MergeJoin, plaNodeId: 4 | | | nodeType: IndexScan, plaNodeId: 0 | | | nodeType: Union, plaNodeId: 3 | | nodeType: LimitSkip, plaNodeId: 6 | | | nodeType: Seek, plaNodeId: 5
merge_join_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 1726.000000 | 1726.000000 | 1726.000000 | 1726.0 | 1726.0 | 1726.0 | 1726.000000 |
mean | 9322.962920 | 37.786211 | 19308.841251 | 1.0 | 110.0 | 0.0 | 8868.545771 |
std | 6058.184291 | 34.792521 | 12577.694721 | 0.0 | 0.0 | 0.0 | 1951.001476 |
min | 176.000000 | 0.000000 | 352.000000 | 1.0 | 110.0 | 0.0 | 5440.000000 |
25% | 4273.250000 | 10.000000 | 8853.000000 | 1.0 | 110.0 | 0.0 | 7217.500000 |
50% | 8177.000000 | 24.000000 | 16801.000000 | 1.0 | 110.0 | 0.0 | 8880.500000 |
75% | 13567.250000 | 62.750000 | 28204.250000 | 1.0 | 110.0 | 0.0 | 10497.500000 |
max | 24875.000000 | 147.000000 | 52044.000000 | 1.0 | 110.0 | 0.0 | 12178.000000 |
merge_join_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.951772 | 0.999881 | NaN | NaN | NaN | 0.618293 |
n_returned | 0.951772 | 1.000000 | 0.951941 | NaN | NaN | NaN | 0.445914 |
n_processed | 0.999881 | 0.951941 | 1.000000 | NaN | NaN | NaN | 0.617830 |
keys_length_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
average_document_size_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
number_of_fields | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
source | 0.618293 | 0.445914 | 0.617830 | NaN | NaN | NaN | 1.000000 |
sns.scatterplot(x=merge_join_df['n_processed'], y=merge_join_df['execution_time'])
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = merge_join_df['execution_time']
X = merge_join_df[['n_processed']]
X = sm.add_constant(X)
mj_lm = sm.OLS(y, X).fit()
mj_lm.summary()
Dep. Variable: | execution_time | R-squared: | 1.000 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 1.000 |
Method: | Least Squares | F-statistic: | 7.237e+06 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:57:14 | Log-Likelihood: | -10281. |
No. Observations: | 1726 | AIC: | 2.057e+04 |
Df Residuals: | 1724 | BIC: | 2.058e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 23.7559 | 4.125 | 5.759 | 0.000 | 15.665 | 31.847 |
n_processed | 0.4816 | 0.000 | 2690.101 | 0.000 | 0.481 | 0.482 |
Omnibus: | 106.850 | Durbin-Watson: | 0.356 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 265.553 |
Skew: | 0.349 | Prob(JB): | 2.17e-58 |
Kurtosis: | 4.790 | Cond. No. | 4.22e+04 |
y_pred = mj_lm.predict(X)
sns.scatterplot(x=merge_join_df['n_processed'], y=merge_join_df['execution_time'])
sns.lineplot(x=merge_join_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = merge_join_df['execution_time']
X = merge_join_df[['n_processed', 'n_returned']]
X = sm.add_constant(X)
mj_lm = sm.OLS(y, X).fit()
mj_lm.summary()
Dep. Variable: | execution_time | R-squared: | 1.000 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 1.000 |
Method: | Least Squares | F-statistic: | 3.617e+06 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:57:18 | Log-Likelihood: | -10281. |
No. Observations: | 1726 | AIC: | 2.057e+04 |
Df Residuals: | 1723 | BIC: | 2.058e+04 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 22.4286 | 4.964 | 4.518 | 0.000 | 12.692 | 32.165 |
n_processed | 0.4819 | 0.001 | 824.206 | 0.000 | 0.481 | 0.483 |
n_returned | -0.1016 | 0.211 | -0.481 | 0.631 | -0.516 | 0.313 |
Omnibus: | 107.983 | Durbin-Watson: | 0.356 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 265.769 |
Skew: | 0.357 | Prob(JB): | 1.95e-58 |
Kurtosis: | 4.785 | Cond. No. | 5.08e+04 |
exp.calibrate(merge_join_df)
R2: 0.9997618248163389 Coefficients: [23.75586675 0.48160358]
binary_join_df = abt_df[abt_df.abt_type == 'BinaryJoin']
binary_join_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
1 | BinaryJoin | 58384 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
6 | BinaryJoin | 58437 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 1 |
11 | BinaryJoin | 58316 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 2 |
16 | BinaryJoin | 58407 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 4 |
21 | BinaryJoin | 58410 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 5 |
exp.print_trees(noout_df, binary_join_df)
SBE stage: nlj, plaNodeId: 3, totalExecutionTime: 63656, nReturned: 59685, nProcessed: 59685 | stage: ixseek, plaNodeId: 0, totalExecutionTime: 157, nReturned: 59685, nProcessed: 59686 | stage: limitskip, plaNodeId: 2, totalExecutionTime: 5115, nReturned: 59685, nProcessed: 59685 | | stage: seek, plaNodeId: 1, totalExecutionTime: 2906, nReturned: 59685, nProcessed: 59685 ABT nodeType: Root, plaNodeId: 4 | nodeType: BinaryJoin, plaNodeId: 3 | | nodeType: IndexScan, plaNodeId: 0 | | nodeType: LimitSkip, plaNodeId: 2 | | | nodeType: Seek, plaNodeId: 1
binary_join_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 1.012700e+04 | 1.012700e+04 | 1.012700e+04 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
mean | 4.756040e+04 | 4.818516e+04 | 4.859375e+04 | 2.178434 | 108.670485 | 0.119285 | 8783.434976 |
std | 1.276394e+05 | 1.299854e+05 | 1.298404e+05 | 2.882973 | 18.483604 | 0.473670 | 3632.120204 |
min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 42.000000 | 0.000000 | 0.000000 |
25% | 6.800000e+01 | 2.000000e+00 | 6.400000e+01 | 1.000000 | 110.000000 | 0.000000 | 6209.500000 |
50% | 1.222000e+03 | 7.500000e+01 | 9.580000e+02 | 1.000000 | 110.000000 | 0.000000 | 9009.000000 |
75% | 1.060500e+04 | 1.139900e+04 | 1.171700e+04 | 1.000000 | 110.000000 | 0.000000 | 11801.500000 |
max | 1.264509e+06 | 1.268990e+06 | 1.268990e+06 | 12.000000 | 133.000000 | 2.000000 | 14788.000000 |
binary_join_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.999830 | 0.999918 | 0.272291 | -0.068941 | 0.446514 | -0.139822 |
n_returned | 0.999830 | 1.000000 | 0.999955 | 0.278109 | -0.067185 | 0.443475 | -0.145968 |
n_processed | 0.999918 | 0.999955 | 1.000000 | 0.277134 | -0.067034 | 0.443178 | -0.144273 |
keys_length_in_bytes | 0.272291 | 0.278109 | 0.277134 | 1.000000 | -0.132785 | -0.102948 | -0.630184 |
average_document_size_in_bytes | -0.068941 | -0.067185 | -0.067034 | -0.132785 | 1.000000 | 0.018116 | 0.260180 |
number_of_fields | 0.446514 | 0.443475 | 0.443178 | -0.102948 | 0.018116 | 1.000000 | 0.387364 |
source | -0.139822 | -0.145968 | -0.144273 | -0.630184 | 0.260180 | 0.387364 | 1.000000 |
sns.scatterplot(x=binary_join_df['n_processed'], y=binary_join_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
sns.scatterplot(x=binary_join_df['n_returned'], y=binary_join_df['execution_time'], color='red')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
y = binary_join_df['execution_time']
X = binary_join_df[['n_processed', 'n_returned']]
X = sm.add_constant(X)
bj_lm = sm.OLS(y, X).fit()
bj_lm.summary()
Dep. Variable: | execution_time | R-squared: | 1.000 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 1.000 |
Method: | Least Squares | F-statistic: | 3.538e+07 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:57:22 | Log-Likelihood: | -88608. |
No. Observations: | 10127 | AIC: | 1.772e+05 |
Df Residuals: | 10124 | BIC: | 1.772e+05 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -421.7009 | 17.160 | -24.575 | 0.000 | -455.337 | -388.065 |
n_processed | 1.4524 | 0.012 | 118.113 | 0.000 | 1.428 | 1.476 |
n_returned | -0.4689 | 0.012 | -38.175 | 0.000 | -0.493 | -0.445 |
Omnibus: | 3542.682 | Durbin-Watson: | 0.428 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 844656.937 |
Skew: | -0.421 | Prob(JB): | 0.00 |
Kurtosis: | 47.733 | Cond. No. | 2.22e+05 |
y = binary_join_df['execution_time']
X = binary_join_df[['n_returned']]
X = sm.add_constant(X)
bj_lm = sm.OLS(y, X).fit()
bj_lm.summary()
Dep. Variable: | execution_time | R-squared: | 1.000 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 1.000 |
Method: | Least Squares | F-statistic: | 2.975e+07 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:57:22 | Log-Likelihood: | -92995. |
No. Observations: | 10127 | AIC: | 1.860e+05 |
Df Residuals: | 10125 | BIC: | 1.860e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 252.9461 | 24.951 | 10.138 | 0.000 | 204.037 | 301.855 |
n_returned | 0.9818 | 0.000 | 5454.579 | 0.000 | 0.981 | 0.982 |
Omnibus: | 6057.968 | Durbin-Watson: | 0.979 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 202589.477 |
Skew: | 2.315 | Prob(JB): | 0.00 |
Kurtosis: | 24.417 | Cond. No. | 1.48e+05 |
y_pred = bj_lm.predict(X)
sns.scatterplot(x=binary_join_df['n_returned'], y=binary_join_df['execution_time'])
sns.lineplot(x=binary_join_df['n_returned'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
y = binary_join_df['execution_time']
X = binary_join_df[['n_processed']]
X = sm.add_constant(X)
bj_lm = sm.OLS(y, X).fit()
bj_lm.summary()
Dep. Variable: | execution_time | R-squared: | 1.000 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 1.000 |
Method: | Least Squares | F-statistic: | 6.186e+07 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:57:32 | Log-Likelihood: | -89289. |
No. Observations: | 10127 | AIC: | 1.786e+05 |
Df Residuals: | 10125 | BIC: | 1.786e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -205.7031 | 17.326 | -11.873 | 0.000 | -239.665 | -171.741 |
n_processed | 0.9830 | 0.000 | 7865.061 | 0.000 | 0.983 | 0.983 |
Omnibus: | 3037.117 | Durbin-Watson: | 0.497 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 504315.454 |
Skew: | -0.099 | Prob(JB): | 0.00 |
Kurtosis: | 37.571 | Cond. No. | 1.48e+05 |
y_pred = bj_lm.predict(X)
sns.scatterplot(x=binary_join_df['n_processed'], y=binary_join_df['execution_time'])
sns.lineplot(x=binary_join_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(binary_join_df)
R2: 0.9998340702333239 Coefficients: [0. 0.98244794]
hash_join_df = abt_df[abt_df.abt_type == 'HashJoin']
hash_join_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
18764 | HashJoin | 2812 | 0 | 4741 | 1 | 110 | 0 | 6336451f7919d20f9deafd95 | 5510 |
18807 | HashJoin | 2811 | 0 | 4741 | 1 | 110 | 0 | 6336451f7919d20f9deafd95 | 5518 |
18958 | HashJoin | 19862 | 22 | 27150 | 1 | 110 | 0 | 633645207919d20f9deafd98 | 5545 |
18965 | HashJoin | 19837 | 22 | 27150 | 1 | 110 | 0 | 633645207919d20f9deafd98 | 5546 |
18996 | HashJoin | 17777 | 20 | 25612 | 1 | 110 | 0 | 633645207919d20f9deafd99 | 5551 |
exp.print_trees(noout_df, hash_join_df)
SBE stage: nlj, plaNodeId: 7, totalExecutionTime: 2871, nReturned: 0, nProcessed: 0 | stage: hj, plaNodeId: 4, totalExecutionTime: 2870, nReturned: 0, nProcessed: 4741 | | stage: ixseek, plaNodeId: 1, totalExecutionTime: 52, nReturned: 4568, nProcessed: 4569 | | stage: ixseek, plaNodeId: 0, totalExecutionTime: 6, nReturned: 173, nProcessed: 174 | stage: limitskip, plaNodeId: 6, totalExecutionTime: 0, nReturned: 0, nProcessed: 0 | | stage: seek, plaNodeId: 5, totalExecutionTime: 0, nReturned: 0, nProcessed: 0 ABT nodeType: Root, plaNodeId: 8 | nodeType: BinaryJoin, plaNodeId: 7 | | nodeType: HashJoin, plaNodeId: 4 | | | nodeType: IndexScan, plaNodeId: 0 | | | nodeType: Union, plaNodeId: 3 | | nodeType: LimitSkip, plaNodeId: 6 | | | nodeType: Seek, plaNodeId: 5
hash_join_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 1.834000e+03 | 1834.000000 | 1.834000e+03 | 1834.0 | 1834.0 | 1834.0 | 1834.000000 |
mean | 1.858712e+06 | 4577.565431 | 1.504955e+06 | 1.0 | 110.0 | 0.0 | 8876.460742 |
std | 1.665740e+06 | 4069.831101 | 1.261856e+06 | 0.0 | 0.0 | 0.0 | 1969.201490 |
min | 2.811000e+03 | 0.000000 | 4.741000e+03 | 1.0 | 110.0 | 0.0 | 5510.000000 |
25% | 4.188332e+05 | 950.000000 | 4.108400e+05 | 1.0 | 110.0 | 0.0 | 7198.250000 |
50% | 1.389704e+06 | 3552.000000 | 1.149390e+06 | 1.0 | 110.0 | 0.0 | 8870.500000 |
75% | 2.934652e+06 | 7177.000000 | 2.374488e+06 | 1.0 | 110.0 | 0.0 | 10545.750000 |
max | 6.931686e+06 | 15732.000000 | 4.978615e+06 | 1.0 | 110.0 | 0.0 | 12232.000000 |
hash_join_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.675287 | 0.995430 | NaN | NaN | NaN | 0.527774 |
n_returned | 0.675287 | 1.000000 | 0.688905 | NaN | NaN | NaN | 0.473347 |
n_processed | 0.995430 | 0.688905 | 1.000000 | NaN | NaN | NaN | 0.519010 |
keys_length_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
average_document_size_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
number_of_fields | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
source | 0.527774 | 0.473347 | 0.519010 | NaN | NaN | NaN | 1.000000 |
sns.scatterplot(x=hash_join_df['n_processed'], y=hash_join_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = hash_join_df['execution_time']
X = hash_join_df[['n_processed', 'n_returned']]
X = sm.add_constant(X)
hj_lm = sm.OLS(y, X).fit()
hj_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.991 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.991 |
Method: | Least Squares | F-statistic: | 1.018e+05 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:57:59 | Log-Likelihood: | -24547. |
No. Observations: | 1834 | AIC: | 4.910e+04 |
Df Residuals: | 1831 | BIC: | 4.912e+04 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -1.088e+05 | 5921.432 | -18.373 | 0.000 | -1.2e+05 | -9.72e+04 |
n_processed | 1.3322 | 0.004 | 331.593 | 0.000 | 1.324 | 1.340 |
n_returned | -8.1563 | 1.246 | -6.548 | 0.000 | -10.599 | -5.713 |
Omnibus: | 398.945 | Durbin-Watson: | 0.148 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1459.629 |
Skew: | 1.033 | Prob(JB): | 0.00 |
Kurtosis: | 6.851 | Cond. No. | 3.17e+06 |
y = hash_join_df['execution_time']
X = hash_join_df[['n_processed']]
X = sm.add_constant(X)
hj_lm = sm.OLS(y, X).fit()
hj_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.991 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.991 |
Method: | Least Squares | F-statistic: | 1.991e+05 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:57:59 | Log-Likelihood: | -24568. |
No. Observations: | 1834 | AIC: | 4.914e+04 |
Df Residuals: | 1832 | BIC: | 4.915e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -1.189e+05 | 5783.543 | -20.551 | 0.000 | -1.3e+05 | -1.08e+05 |
n_processed | 1.3140 | 0.003 | 446.169 | 0.000 | 1.308 | 1.320 |
Omnibus: | 368.827 | Durbin-Watson: | 0.143 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1094.659 |
Skew: | 1.022 | Prob(JB): | 1.98e-238 |
Kurtosis: | 6.185 | Cond. No. | 3.06e+06 |
y_pred = hj_lm.predict(X)
sns.scatterplot(x=hash_join_df['n_processed'], y=hash_join_df['execution_time'])
sns.lineplot(x=hash_join_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(hash_join_df)
R2: 0.9887787330156012 Coefficients: [0. 1.26765332]
union_df = abt_df[abt_df.abt_type == 'Union']
union_df = union_df[union_df.execution_time > 0]
union_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
58868 | Union | 6 | 0 | 1252 | 1 | 110 | 0 | 63365d777919d20f9deb1ac6 | 12242 |
58911 | Union | 6 | 0 | 1252 | 1 | 110 | 0 | 63365d777919d20f9deb1ac6 | 12249 |
58984 | Union | 6 | 0 | 1577 | 1 | 110 | 0 | 63365d777919d20f9deb1ac8 | 12261 |
58991 | Union | 4 | 0 | 1577 | 1 | 110 | 0 | 63365d777919d20f9deb1ac8 | 12262 |
58998 | Union | 13 | 0 | 1577 | 1 | 110 | 0 | 63365d777919d20f9deb1ac8 | 12264 |
exp.print_trees(noout_df, union_df)
SBE stage: nlj, plaNodeId: 9, totalExecutionTime: 1482, nReturned: 0, nProcessed: 0 | stage: filter, plaNodeId: 6, totalExecutionTime: 1479, nReturned: 0, nProcessed: 1252 | | stage: group, plaNodeId: 5, totalExecutionTime: 1292, nReturned: 1252, nProcessed: 1252 | | | stage: union, plaNodeId: 4, totalExecutionTime: 23, nReturned: 1252, nProcessed: 1252 | | | | stage: project, plaNodeId: 1, totalExecutionTime: 12, nReturned: 612, nProcessed: 612 | | | | | stage: ixseek, plaNodeId: 0, totalExecutionTime: 11, nReturned: 612, nProcessed: 613 | | | | stage: project, plaNodeId: 3, totalExecutionTime: 5, nReturned: 640, nProcessed: 640 | | | | | stage: ixseek, plaNodeId: 2, totalExecutionTime: 4, nReturned: 640, nProcessed: 641 | stage: limitskip, plaNodeId: 8, totalExecutionTime: 0, nReturned: 0, nProcessed: 0 | | stage: seek, plaNodeId: 7, totalExecutionTime: 0, nReturned: 0, nProcessed: 0 ABT nodeType: Root, plaNodeId: 10 | nodeType: BinaryJoin, plaNodeId: 9 | | nodeType: Filter, plaNodeId: 6 | | | nodeType: GroupBy, plaNodeId: 5 | | | | nodeType: Union, plaNodeId: 4 | | nodeType: LimitSkip, plaNodeId: 8 | | | nodeType: Seek, plaNodeId: 7
union_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 799.000000 | 799.000000 | 799.000000 | 799.0 | 799.0 | 799.0 | 799.000000 |
mean | 83.887359 | 38.868586 | 19714.630788 | 1.0 | 110.0 | 0.0 | 13081.634543 |
std | 71.748585 | 34.823026 | 12736.668332 | 0.0 | 0.0 | 0.0 | 493.081987 |
min | 1.000000 | 0.000000 | 813.000000 | 1.0 | 110.0 | 0.0 | 12242.000000 |
25% | 18.500000 | 11.000000 | 8938.000000 | 1.0 | 110.0 | 0.0 | 12672.500000 |
50% | 71.000000 | 27.000000 | 17739.000000 | 1.0 | 110.0 | 0.0 | 13086.000000 |
75% | 134.500000 | 61.000000 | 29181.000000 | 1.0 | 110.0 | 0.0 | 13503.000000 |
max | 360.000000 | 147.000000 | 52044.000000 | 1.0 | 110.0 | 0.0 | 13925.000000 |
union_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.920163 | 0.963655 | NaN | NaN | NaN | 0.656127 |
n_returned | 0.920163 | 1.000000 | 0.946314 | NaN | NaN | NaN | 0.483939 |
n_processed | 0.963655 | 0.946314 | 1.000000 | NaN | NaN | NaN | 0.670496 |
keys_length_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
average_document_size_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
number_of_fields | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
source | 0.656127 | 0.483939 | 0.670496 | NaN | NaN | NaN | 1.000000 |
sns.scatterplot(x=union_df['n_processed'], y=union_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = union_df['execution_time']
X = union_df[['n_processed']]
X = sm.add_constant(X)
union_lm = sm.OLS(y, X).fit()
union_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.929 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.929 |
Method: | Least Squares | F-statistic: | 1.037e+04 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:58:08 | Log-Likelihood: | -3492.9 |
No. Observations: | 799 | AIC: | 6990. |
Df Residuals: | 797 | BIC: | 6999. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -23.1333 | 1.251 | -18.493 | 0.000 | -25.589 | -20.678 |
n_processed | 0.0054 | 5.33e-05 | 101.835 | 0.000 | 0.005 | 0.006 |
Omnibus: | 172.558 | Durbin-Watson: | 1.587 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 581.131 |
Skew: | 1.014 | Prob(JB): | 6.44e-127 |
Kurtosis: | 6.653 | Cond. No. | 4.33e+04 |
y_pred = union_lm.predict(X)
sns.scatterplot(x=union_df['n_processed'], y=union_df['execution_time'])
sns.lineplot(x=union_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(union_df)
R2: 0.8980073052101615 Coefficients: [0. 0.00460031]
ls_df = abt_df[abt_df.abt_type == 'LimitSkip']
ls_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
0 | LimitSkip | 2209 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
5 | LimitSkip | 2204 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 1 |
10 | LimitSkip | 2123 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 2 |
15 | LimitSkip | 2151 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 4 |
20 | LimitSkip | 2207 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 5 |
ls_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 10127.000000 | 1.012700e+04 | 1.012700e+04 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
mean | 3718.346104 | 4.818516e+04 | 4.859375e+04 | 2.178434 | 108.670485 | 0.119285 | 8783.434976 |
std | 14141.893659 | 1.299854e+05 | 1.298404e+05 | 2.882973 | 18.483604 | 0.473670 | 3632.120204 |
min | -12.000000 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 42.000000 | 0.000000 | 0.000000 |
25% | 4.000000 | 2.000000e+00 | 6.400000e+01 | 1.000000 | 110.000000 | 0.000000 | 6209.500000 |
50% | 81.000000 | 7.500000e+01 | 9.580000e+02 | 1.000000 | 110.000000 | 0.000000 | 9009.000000 |
75% | 1341.500000 | 1.139900e+04 | 1.171700e+04 | 1.000000 | 110.000000 | 0.000000 | 11801.500000 |
max | 177869.000000 | 1.268990e+06 | 1.268990e+06 | 12.000000 | 133.000000 | 2.000000 | 14788.000000 |
ls_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.892805 | 0.893080 | 0.109001 | 0.032698 | 0.545629 | 0.083271 |
n_returned | 0.892805 | 1.000000 | 0.999955 | 0.278109 | -0.067185 | 0.443475 | -0.145968 |
n_processed | 0.893080 | 0.999955 | 1.000000 | 0.277134 | -0.067034 | 0.443178 | -0.144273 |
keys_length_in_bytes | 0.109001 | 0.278109 | 0.277134 | 1.000000 | -0.132785 | -0.102948 | -0.630184 |
average_document_size_in_bytes | 0.032698 | -0.067185 | -0.067034 | -0.132785 | 1.000000 | 0.018116 | 0.260180 |
number_of_fields | 0.545629 | 0.443475 | 0.443178 | -0.102948 | 0.018116 | 1.000000 | 0.387364 |
source | 0.083271 | -0.145968 | -0.144273 | -0.630184 | 0.260180 | 0.387364 | 1.000000 |
sns.scatterplot(x=ls_df['n_processed'], y=ls_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
sns.scatterplot(x=ls_df['n_returned'], y=ls_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
y = ls_df['execution_time']
X = ls_df[['n_processed']]
X = sm.add_constant(X)
ls_lm = sm.OLS(y, X).fit()
ls_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.798 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.798 |
Method: | Least Squares | F-statistic: | 3.990e+04 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:58:13 | Log-Likelihood: | -1.0306e+05 |
No. Observations: | 10127 | AIC: | 2.061e+05 |
Df Residuals: | 10125 | BIC: | 2.061e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -1008.4678 | 67.510 | -14.938 | 0.000 | -1140.801 | -876.134 |
n_processed | 0.0973 | 0.000 | 199.744 | 0.000 | 0.096 | 0.098 |
Omnibus: | 2707.668 | Durbin-Watson: | 0.132 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 184892.821 |
Skew: | 0.342 | Prob(JB): | 0.00 |
Kurtosis: | 23.922 | Cond. No. | 1.48e+05 |
y_pred = ls_lm.predict(X)
sns.scatterplot(x=ls_df['n_processed'], y=ls_df['execution_time'])
sns.lineplot(x=ls_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = ls_df['execution_time']
X = ls_df[['n_processed']]
X = sm.add_constant(X)
ls_glm = sm.GLM(y, X).fit_constrained(([1, 0], ls_df.execution_time.min()))
ls_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 10127 |
---|---|---|---|
Model: | GLM | Df Residuals: | 10126 |
Model Family: | Gaussian | Df Model: | 0 |
Link Function: | identity | Scale: | 4.1351e+07 |
Method: | IRLS | Log-Likelihood: | -1.0317e+05 |
Date: | Fri, 30 Sep 2022 | Deviance: | 4.1872e+11 |
Time: | 10:58:26 | Pearson chi2: | 4.19e+11 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | 0.9784 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -12.0000 | 0 | -inf | 0.000 | -12.000 | -12.000 |
n_processed | 0.0948 | 0.000 | 205.562 | 0.000 | 0.094 | 0.096 |
y_pred = ls_glm.predict(X)
sns.scatterplot(x=ls_df['n_processed'], y=ls_df['execution_time'])
sns.lineplot(x=ls_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(ls_df)
R2: 0.7931314002167056 Coefficients: [0. 0.09472212]
group_df = abt_df[abt_df.abt_type == 'GroupBy']
group_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
58871 | GroupBy | 1269 | 0 | 1252 | 1 | 110 | 0 | 63365d777919d20f9deb1ac6 | 12242 |
58914 | GroupBy | 1260 | 0 | 1252 | 1 | 110 | 0 | 63365d777919d20f9deb1ac6 | 12249 |
58987 | GroupBy | 1586 | 0 | 1577 | 1 | 110 | 0 | 63365d777919d20f9deb1ac8 | 12261 |
58994 | GroupBy | 1607 | 0 | 1577 | 1 | 110 | 0 | 63365d777919d20f9deb1ac8 | 12262 |
59001 | GroupBy | 1575 | 0 | 1577 | 1 | 110 | 0 | 63365d777919d20f9deb1ac8 | 12264 |
exp.print_trees(df, group_df)
SBE stage: nlj, plaNodeId: 9, totalExecutionTime: 1482, nReturned: 0, nProcessed: 0 | stage: filter, plaNodeId: 6, totalExecutionTime: 1479, nReturned: 0, nProcessed: 1252 | | stage: group, plaNodeId: 5, totalExecutionTime: 1292, nReturned: 1252, nProcessed: 1252 | | | stage: union, plaNodeId: 4, totalExecutionTime: 23, nReturned: 1252, nProcessed: 1252 | | | | stage: project, plaNodeId: 1, totalExecutionTime: 12, nReturned: 612, nProcessed: 612 | | | | | stage: ixseek, plaNodeId: 0, totalExecutionTime: 11, nReturned: 612, nProcessed: 613 | | | | stage: project, plaNodeId: 3, totalExecutionTime: 5, nReturned: 640, nProcessed: 640 | | | | | stage: ixseek, plaNodeId: 2, totalExecutionTime: 4, nReturned: 640, nProcessed: 641 | stage: limitskip, plaNodeId: 8, totalExecutionTime: 0, nReturned: 0, nProcessed: 0 | | stage: seek, plaNodeId: 7, totalExecutionTime: 0, nReturned: 0, nProcessed: 0 ABT nodeType: Root, plaNodeId: 10 | nodeType: BinaryJoin, plaNodeId: 9 | | nodeType: Filter, plaNodeId: 6 | | | nodeType: GroupBy, plaNodeId: 5 | | | | nodeType: Union, plaNodeId: 4 | | nodeType: LimitSkip, plaNodeId: 8 | | | nodeType: Seek, plaNodeId: 7
group_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 810.000000 | 810.000000 | 810.000000 | 810.0 | 810.0 | 810.0 | 810.000000 |
mean | 27328.232099 | 38.517284 | 19519.030864 | 1.0 | 110.0 | 0.0 | 13076.240741 |
std | 19411.364976 | 34.729259 | 12689.273304 | 0.0 | 0.0 | 0.0 | 495.434536 |
min | 881.000000 | 0.000000 | 813.000000 | 1.0 | 110.0 | 0.0 | 12242.000000 |
25% | 10905.000000 | 10.000000 | 8925.000000 | 1.0 | 110.0 | 0.0 | 12641.750000 |
50% | 23979.000000 | 25.000000 | 17295.000000 | 1.0 | 110.0 | 0.0 | 13082.000000 |
75% | 40594.000000 | 61.000000 | 28405.000000 | 1.0 | 110.0 | 0.0 | 13501.750000 |
max | 82277.000000 | 147.000000 | 51897.000000 | 1.0 | 110.0 | 0.0 | 13925.000000 |
group_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.946915 | 0.999021 | NaN | NaN | NaN | 0.674617 |
n_returned | 0.946915 | 1.000000 | 0.946189 | NaN | NaN | NaN | 0.482164 |
n_processed | 0.999021 | 0.946189 | 1.000000 | NaN | NaN | NaN | 0.668759 |
keys_length_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
average_document_size_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
number_of_fields | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
source | 0.674617 | 0.482164 | 0.668759 | NaN | NaN | NaN | 1.000000 |
sns.scatterplot(x=group_df['n_returned'], y=group_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
sns.scatterplot(x=group_df['n_processed'], y=group_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = group_df['execution_time']
X = group_df[['n_processed']]
X = sm.add_constant(X)
group_lm = sm.OLS(y, X).fit()
group_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.998 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.998 |
Method: | Least Squares | F-statistic: | 4.121e+05 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:58:53 | Log-Likelihood: | -6620.7 |
No. Observations: | 810 | AIC: | 1.325e+04 |
Df Residuals: | 808 | BIC: | 1.325e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -2501.6979 | 55.414 | -45.146 | 0.000 | -2610.469 | -2392.926 |
n_processed | 1.5282 | 0.002 | 641.951 | 0.000 | 1.524 | 1.533 |
Omnibus: | 311.031 | Durbin-Watson: | 0.432 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1720.593 |
Skew: | 1.654 | Prob(JB): | 0.00 |
Kurtosis: | 9.327 | Cond. No. | 4.27e+04 |
y_pred = group_lm.predict(X)
sns.scatterplot(x=group_df['n_processed'], y=group_df['execution_time'])
sns.lineplot(x=group_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = group_df['execution_time']
X = group_df[['n_processed']]
X = sm.add_constant(X)
group_glm = sm.GLM(y, X).fit_constrained(([1, 0], group_df.execution_time.min()))
group_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 810 |
---|---|---|---|
Model: | GLM | Df Residuals: | 809 |
Model Family: | Gaussian | Df Model: | 0 |
Link Function: | identity | Scale: | 4.1379e+06 |
Method: | IRLS | Log-Likelihood: | -7319.3 |
Date: | Fri, 30 Sep 2022 | Deviance: | 3.3476e+09 |
Time: | 10:58:55 | Pearson chi2: | 3.35e+09 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | 1.000 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 881.0000 | 0 | inf | 0.000 | 881.000 | 881.000 |
n_processed | 1.4064 | 0.003 | 458.015 | 0.000 | 1.400 | 1.412 |
y_pred = group_glm.predict(X)
sns.scatterplot(x=group_df['n_processed'], y=group_df['execution_time'])
sns.lineplot(x=group_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(group_df)
R2: 0.9931070626694147 Coefficients: [0. 1.43812359]
eval_df = abt_df[abt_df.abt_type == 'Evaluation']
eval_df = eval_df[eval_df.number_of_fields > 0]
eval_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
68855 | Evaluation | 348351 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13940 |
68859 | Evaluation | 348820 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13941 |
68863 | Evaluation | 344049 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13942 |
68867 | Evaluation | 344654 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13943 |
68871 | Evaluation | 347065 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13944 |
eval_df.number_of_fields.unique()
array([ 2, 10])
exp.print_trees(df, eval_df)
SBE stage: project, plaNodeId: 2, totalExecutionTime: 681770, nReturned: 494877, nProcessed: 494877 | stage: filter, plaNodeId: 1, totalExecutionTime: 333419, nReturned: 494877, nProcessed: 1000000 | | stage: scan, plaNodeId: 0, totalExecutionTime: 2389, nReturned: 1000000, nProcessed: 1000000 ABT nodeType: Root, plaNodeId: 3 | nodeType: Evaluation, plaNodeId: 2 | | nodeType: Filter, plaNodeId: 1 | | | nodeType: PhysicalScan, plaNodeId: 0
eval_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 1.530000e+03 | 1.530000e+03 | 1.530000e+03 | 1530.000000 | 1530.000000 | 1530.000000 | 1530.000000 |
mean | 2.096814e+05 | 7.762058e+06 | 7.762058e+06 | 0.500000 | 87.705882 | 6.000000 | 14789.492810 |
std | 3.683440e+05 | 8.449659e+06 | 8.449659e+06 | 0.500163 | 22.304124 | 4.001308 | 490.947636 |
min | 3.319000e+03 | 4.466000e+03 | 4.466000e+03 | 0.000000 | 65.000000 | 2.000000 | 13940.000000 |
25% | 1.581975e+04 | 2.597320e+05 | 2.597320e+05 | 0.000000 | 65.000000 | 2.000000 | 14365.250000 |
50% | 2.870100e+04 | 3.737672e+06 | 3.737672e+06 | 0.500000 | 88.000000 | 6.000000 | 14789.500000 |
75% | 2.244522e+05 | 1.500806e+07 | 1.500806e+07 | 1.000000 | 110.000000 | 10.000000 | 15213.750000 |
max | 1.729344e+06 | 2.499542e+07 | 2.499542e+07 | 1.000000 | 110.000000 | 10.000000 | 15638.000000 |
eval_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | -0.383822 | -0.383822 | 0.511727 | 0.511709 | -0.511727 | -0.340920 |
n_returned | -0.383822 | 1.000000 | 1.000000 | -0.856928 | -0.854474 | 0.856928 | 0.928525 |
n_processed | -0.383822 | 1.000000 | 1.000000 | -0.856928 | -0.854474 | 0.856928 | 0.928525 |
keys_length_in_bytes | 0.511727 | -0.856928 | -0.856928 | 1.000000 | 0.999878 | -1.000000 | -0.866017 |
average_document_size_in_bytes | 0.511709 | -0.854474 | -0.854474 | 0.999878 | 1.000000 | -0.999878 | -0.864298 |
number_of_fields | -0.511727 | 0.856928 | 0.856928 | -1.000000 | -0.999878 | 1.000000 | 0.866017 |
source | -0.340920 | 0.928525 | 0.928525 | -0.866017 | -0.864298 | 0.866017 | 1.000000 |
sns.scatterplot(x=eval_df['n_returned'], y=eval_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
y = eval_df['execution_time']
X = eval_df[['n_processed', 'keys_length_in_bytes']]
X = sm.add_constant(X)
eval_lm = sm.OLS(y, X).fit()
eval_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.273 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.272 |
Method: | Least Squares | F-statistic: | 286.9 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 1.68e-106 |
Time: | 10:59:00 | Log-Likelihood: | -21536. |
No. Observations: | 1530 | AIC: | 4.308e+04 |
Df Residuals: | 1527 | BIC: | 4.309e+04 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -1.134e+05 | 2.99e+04 | -3.789 | 0.000 | -1.72e+05 | -5.47e+04 |
n_processed | 0.0090 | 0.002 | 4.863 | 0.000 | 0.005 | 0.013 |
keys_length_in_bytes | 5.068e+05 | 3.12e+04 | 16.257 | 0.000 | 4.46e+05 | 5.68e+05 |
Omnibus: | 540.865 | Durbin-Watson: | 0.182 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1874.829 |
Skew: | 1.743 | Prob(JB): | 0.00 |
Kurtosis: | 7.154 | Cond. No. | 6.06e+07 |
y_pred = eval_lm.predict(X)
sns.scatterplot(x=eval_df['n_processed'], y=eval_df['execution_time'])
sns.lineplot(x=eval_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = eval_df['execution_time']
X = eval_df[['n_processed']]
X = sm.add_constant(X)
eval_glm = sm.GLM(y, X).fit_constrained(([1, 0], 0))
eval_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 1530 |
---|---|---|---|
Model: | GLM | Df Residuals: | 1529 |
Model Family: | Gaussian | Df Model: | 0 |
Link Function: | identity | Scale: | 1.7824e+11 |
Method: | IRLS | Log-Likelihood: | -21989. |
Date: | Fri, 30 Sep 2022 | Deviance: | 2.7253e+14 |
Time: | 10:59:02 | Pearson chi2: | 2.73e+14 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | -0.2695 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0 | 0 | nan | nan | 0 | 0 |
n_processed | 0.0033 | 0.001 | 3.503 | 0.000 | 0.001 | 0.005 |
y_pred = eval_glm.predict(X)
sns.scatterplot(x=eval_df['n_processed'], y=eval_df['execution_time'])
sns.lineplot(x=eval_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(eval_df, ['n_returned', 'number_of_fields'])
R2: 0.0 Coefficients: [209681.42810457 0. 0. ]
exp.calibrate(eval_df, ['n_returned'])
R2: -2.220446049250313e-16 Coefficients: [209681.42810457 0. ]
eval_unwind_df = eval_df[eval_df.number_of_fields == 10]
eval_unwind_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
73123 | Evaluation | 9996 | 4999248 | 4999248 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14791 |
73127 | Evaluation | 6521 | 4999248 | 4999248 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14792 |
73131 | Evaluation | 6387 | 4999248 | 4999248 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14793 |
73135 | Evaluation | 6455 | 4999248 | 4999248 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14794 |
73139 | Evaluation | 6498 | 4999248 | 4999248 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14795 |
exp.print_trees(df, eval_unwind_df)
SBE stage: project, plaNodeId: 2, totalExecutionTime: 15251, nReturned: 4999248, nProcessed: 4999248 | stage: unwind, plaNodeId: 1, totalExecutionTime: 5255, nReturned: 4999248, nProcessed: 1000000 | | stage: scan, plaNodeId: 0, totalExecutionTime: 2682, nReturned: 1000000, nProcessed: 1000000 ABT nodeType: Root, plaNodeId: 3 | nodeType: Evaluation, plaNodeId: 2 | | nodeType: Unwind, plaNodeId: 1 | | | nodeType: PhysicalScan, plaNodeId: 0
exp.print_explain(df, eval_unwind_df)
{ "explainVersion": "2", "queryPlanner": { "namespace": "abt_calibration_big.c_arr_01_1000000", "indexFilterSet": false, "optimizedPipeline": true, "maxIndexedOrSolutionsReached": false, "maxIndexedAndSolutionsReached": false, "maxScansToExplodeReached": false, "winningPlan": { "optimizerPlan": { "nodeType": "Root", "properties": { "cost": 20900.000002, "localCost": 0.0, "adjustedCE": 10000000.0, "planNodeID": 3, "logicalProperties": { "cardinalityEstimate": [ { "ce": 10000000.0 } ], "projections": [ "embedProj_0", "scan_0", "unwoundPid_0", "unwoundProj_0" ], "collectionAvailability": [ "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c" ] } }, "projections": [ "embedProj_0" ], "references": [ { "nodeType": "Variable", "name": "embedProj_0" } ], "child": { "nodeType": "Evaluation", "properties": { "cost": 20900.000002, "localCost": 20000.000001, "adjustedCE": 10000000.0, "planNodeID": 2, "logicalProperties": { "cardinalityEstimate": [ { "ce": 10000000.0 } ], "projections": [ "embedProj_0", "scan_0", "unwoundPid_0", "unwoundProj_0" ], "collectionAvailability": [ "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c" ] }, "physicalProperties": { "projections": [ "embedProj_0" ] } }, "projection": { "embedProj_0": { "nodeType": "If", "condition": { "nodeType": "BinaryOp", "op": "Or", "left": { "nodeType": "FunctionCall", "name": "exists", "arguments": [ { "nodeType": "Variable", "name": "unwoundProj_0" } ] }, "right": { "nodeType": "FunctionCall", "name": "isObject", "arguments": [ { "nodeType": "Variable", "name": "scan_0" } ] } }, "then": { "nodeType": "FunctionCall", "name": "setField", "arguments": [ { "nodeType": "Variable", "name": "scan_0" }, { "nodeType": "Const", "value": "as" }, { "nodeType": "Variable", "name": "unwoundProj_0" } ] }, "else": { "nodeType": "Variable", "name": "scan_0" } } }, "child": { "nodeType": "Unwind", "properties": { "cost": 900.000001, "localCost": 300.0, "adjustedCE": 10000000.0, "planNodeID": 1, "logicalProperties": { "cardinalityEstimate": [ { "ce": 10000000.0 } ], "projections": [ "scan_0", "unwoundPid_0", "unwoundProj_0" ], "collectionAvailability": [ "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c" ] }, "physicalProperties": { "projections": [ "scan_0", "unwoundProj_0" ] } }, "retainNonArrays": false, "bind": { "unwoundPid_0": { "nodeType": "Source" }, "unwoundProj_0": { "nodeType": "Source" } }, "child": { "nodeType": "PhysicalScan", "properties": { "cost": 600.000001, "localCost": 600.000001, "adjustedCE": 1000000.0, "planNodeID": 0, "logicalProperties": { "cardinalityEstimate": [ { "ce": 1000000.0 }, { "requirementCEs": [ { "refProjection": "scan_0", "path": { "nodeType": "PathGet", "path": "as", "input": { "nodeType": "PathIdentity" } }, "ce": 1000000.0 } ] } ], "projections": [ "scan_0", "unwoundProj_0" ], "indexingAvailability": { "groupId": 0, "scanProjection": "scan_0", "scanDefName": "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c", "eqPredsOnly": false }, "collectionAvailability": [ "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c" ] }, "physicalProperties": { "projections": [ "scan_0", "unwoundProj_0" ], "indexingRequirement": { "target": "Complete", "dedupRID": true } } }, "fieldProjectionMap": { "<root>": "scan_0", "as": "unwoundProj_0" }, "scanDefName": "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c", "parallel": false, "bindings": { "scan_0": { "nodeType": "Source" }, "unwoundProj_0": { "nodeType": "Source" } } } } } }, "slotBasedPlan": { "slots": "$$RESULT=s5 env: { }", "stages": "[2] project [s5 = \n if (exists(s3) || isObject(s1)) \n then setField(s1, \"as\", s3) \n else s1 \n] \n[1] unwind s3 s4 s2 false \n[0] scan s1 none none none none none [s2 = as] @\"fc452517-8329-417c-93b1-28e04369853c\" true false " } }, "rejectedPlans": [] }, "executionStats": { "executionSuccess": true, "nReturned": 4999248, "executionTimeMillis": 3213, "totalKeysExamined": 0, "totalDocsExamined": 1000000, "executionStages": { "stage": "project", "planNodeId": 2, "nReturned": 4999248, "executionTimeMillisEstimate": 15, "executionTimeMicros": 15251, "opens": 1, "closes": 1, "saveState": 0, "restoreState": 0, "isEOF": 1, "projections": { "5": "\n if (exists(s3) || isObject(s1)) \n then setField(s1, \"as\", s3) \n else s1 \n" }, "inputStage": { "stage": "unwind", "planNodeId": 1, "nReturned": 4999248, "executionTimeMillisEstimate": 5, "executionTimeMicros": 5255, "opens": 1, "closes": 1, "saveState": 0, "restoreState": 0, "isEOF": 1, "inputSlot": 2, "outSlot": 3, "outIndexSlot": 4, "preserveNullAndEmptyArrays": 0, "inputStage": { "stage": "scan", "planNodeId": 0, "nReturned": 1000000, "executionTimeMillisEstimate": 2, "executionTimeMicros": 2682, "opens": 1, "closes": 1, "saveState": 0, "restoreState": 0, "isEOF": 1, "numReads": 1000000, "recordSlot": 1, "fields": [ "as" ], "outputSlots": [ 2 ] } } } }, "command": { "aggregate": "c_arr_01_1000000", "pipeline": [ { "$unwind": "$as" } ], "cursor": {}, "$db": "abt_calibration_big" }, "serverInfo": { "host": "ip-10-122-6-29", "port": 27017, "version": "6.2.0-alpha", "gitVersion": "unknown" }, "serverParameters": { "internalQueryFacetBufferSizeBytes": 104857600, "internalQueryFacetMaxOutputDocSizeBytes": 104857600, "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600, "internalDocumentSourceGroupMaxMemoryBytes": 104857600, "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600, "internalQueryProhibitBlockingMergeOnMongoS": 0, "internalQueryMaxAddToSetBytes": 104857600, "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600 }, "ok": 1.0 }
eval_unwind_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.910525 | 0.910525 | NaN | 0.136833 | NaN | 0.910781 |
n_returned | 0.910525 | 1.000000 | 1.000000 | NaN | 0.293555 | NaN | 0.998277 |
n_processed | 0.910525 | 1.000000 | 1.000000 | NaN | 0.293555 | NaN | 0.998277 |
keys_length_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
average_document_size_in_bytes | 0.136833 | 0.293555 | 0.293555 | NaN | 1.000000 | NaN | 0.292365 |
number_of_fields | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
source | 0.910781 | 0.998277 | 0.998277 | NaN | 0.292365 | NaN | 1.000000 |
sns.scatterplot(x=eval_unwind_df['n_returned'], y=eval_unwind_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
exp.calibrate(eval_unwind_df, ['n_returned'])
R2: 0.8197887625374135 Coefficients: [0. 0.00144283]
eval_sum_df = eval_df[eval_df.number_of_fields == 2]
eval_sum_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
68855 | Evaluation | 348351 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13940 |
68859 | Evaluation | 348820 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13941 |
68863 | Evaluation | 344049 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13942 |
68867 | Evaluation | 344654 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13943 |
68871 | Evaluation | 347065 | 494877 | 494877 | 1 | 110 | 2 | 63365dc67919d20f9deb2214 | 13944 |
exp.print_trees(df, eval_sum_df)
SBE stage: project, plaNodeId: 2, totalExecutionTime: 681770, nReturned: 494877, nProcessed: 494877 | stage: filter, plaNodeId: 1, totalExecutionTime: 333419, nReturned: 494877, nProcessed: 1000000 | | stage: scan, plaNodeId: 0, totalExecutionTime: 2389, nReturned: 1000000, nProcessed: 1000000 ABT nodeType: Root, plaNodeId: 3 | nodeType: Evaluation, plaNodeId: 2 | | nodeType: Filter, plaNodeId: 1 | | | nodeType: PhysicalScan, plaNodeId: 0
exp.print_explain(df, eval_sum_df)
{ "explainVersion": "2", "queryPlanner": { "namespace": "abt_calibration_big.c_int_05_1000000", "indexFilterSet": false, "optimizedPipeline": true, "maxIndexedOrSolutionsReached": false, "maxIndexedAndSolutionsReached": false, "maxScansToExplodeReached": false, "winningPlan": { "optimizerPlan": { "nodeType": "Root", "properties": { "cost": 1762.000003, "localCost": 0.0, "adjustedCE": 481000.0, "planNodeID": 3, "logicalProperties": { "cardinalityEstimate": [ { "ce": 481000.0 } ], "projections": [ "combinedProjection_0", "scan_0" ], "indexingAvailability": { "groupId": 0, "scanProjection": "scan_0", "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192", "eqPredsOnly": false }, "collectionAvailability": [ "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192" ] }, "physicalProperties": { "indexingRequirement": { "target": "Complete", "dedupRID": true } } }, "projections": [ "combinedProjection_0" ], "references": [ { "nodeType": "Variable", "name": "combinedProjection_0" } ], "child": { "nodeType": "Evaluation", "properties": { "cost": 1762.000003, "localCost": 962.000001, "adjustedCE": 481000.0, "planNodeID": 2, "logicalProperties": { "cardinalityEstimate": [ { "ce": 481000.0 } ], "projections": [ "combinedProjection_0", "scan_0" ], "indexingAvailability": { "groupId": 0, "scanProjection": "scan_0", "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192", "eqPredsOnly": false }, "collectionAvailability": [ "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192" ] }, "physicalProperties": { "projections": [ "combinedProjection_0" ], "indexingRequirement": { "target": "Complete", "dedupRID": true } } }, "projection": { "combinedProjection_0": { "nodeType": "Let", "variable": "valDefault_0", "bind": { "nodeType": "Let", "variable": "inputField_0", "bind": { "nodeType": "If", "condition": { "nodeType": "FunctionCall", "name": "isObject", "arguments": [ { "nodeType": "Variable", "name": "scan_0" } ] }, "then": { "nodeType": "FunctionCall", "name": "keepFields", "arguments": [ { "nodeType": "Variable", "name": "scan_0" }, { "nodeType": "Const", "value": "_id" }, { "nodeType": "Const", "value": "in3" } ] }, "else": { "nodeType": "Variable", "name": "scan_0" } }, "expression": { "nodeType": "Let", "variable": "valField_0", "bind": { "nodeType": "BinaryOp", "op": "Add", "left": { "nodeType": "FunctionCall", "name": "getField", "arguments": [ { "nodeType": "Variable", "name": "scan_0" }, { "nodeType": "Const", "value": "in1" } ] }, "right": { "nodeType": "FunctionCall", "name": "getField", "arguments": [ { "nodeType": "Variable", "name": "scan_0" }, { "nodeType": "Const", "value": "in2" } ] } }, "expression": { "nodeType": "If", "condition": { "nodeType": "BinaryOp", "op": "Or", "left": { "nodeType": "FunctionCall", "name": "exists", "arguments": [ { "nodeType": "Variable", "name": "valField_0" } ] }, "right": { "nodeType": "FunctionCall", "name": "isObject", "arguments": [ { "nodeType": "Variable", "name": "inputField_0" } ] } }, "then": { "nodeType": "FunctionCall", "name": "setField", "arguments": [ { "nodeType": "Variable", "name": "inputField_0" }, { "nodeType": "Const", "value": "in3" }, { "nodeType": "Variable", "name": "valField_0" } ] }, "else": { "nodeType": "Variable", "name": "inputField_0" } } } }, "expression": { "nodeType": "If", "condition": { "nodeType": "FunctionCall", "name": "exists", "arguments": [ { "nodeType": "Variable", "name": "valDefault_0" } ] }, "then": { "nodeType": "Variable", "name": "valDefault_0" }, "else": { "nodeType": "Const", "value": {} } } } }, "child": { "nodeType": "Filter", "properties": { "cost": 800.000002, "localCost": 800.000002, "adjustedCE": 1000000.0, "planNodeID": 1, "logicalProperties": { "cardinalityEstimate": [ { "ce": 481000.0 }, { "requirementCEs": [ { "refProjection": "scan_0", "path": { "nodeType": "PathGet", "path": "in1", "input": { "nodeType": "PathIdentity" } }, "ce": 481000.0 } ] } ], "projections": [ "scan_0" ], "indexingAvailability": { "groupId": 0, "scanProjection": "scan_0", "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192", "eqPredsOnly": false }, "collectionAvailability": [ "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192" ] }, "physicalProperties": { "projections": [ "scan_0" ], "indexingRequirement": { "target": "Complete", "dedupRID": true } } }, "filter": { "nodeType": "If", "condition": { "nodeType": "FunctionCall", "name": "fillEmpty", "arguments": [ { "nodeType": "BinaryOp", "op": "Gt", "left": { "nodeType": "BinaryOp", "op": "Cmp3w", "left": { "nodeType": "Variable", "name": "evalTemp_0" }, "right": { "nodeType": "Const", "value": 500 } }, "right": { "nodeType": "Const", "value": 0 } }, { "nodeType": "Const", "value": false } ] }, "then": { "nodeType": "BinaryOp", "op": "Lt", "left": { "nodeType": "BinaryOp", "op": "Cmp3w", "left": { "nodeType": "Variable", "name": "evalTemp_0" }, "right": { "nodeType": "Const", "value": "" } }, "right": { "nodeType": "Const", "value": 0 } }, "else": { "nodeType": "Const", "value": false } }, "child": { "nodeType": "PhysicalScan", "properties": { "cost": 800.000002, "localCost": 800.000002, "adjustedCE": 1000000.0, "planNodeID": 0, "logicalProperties": { "cardinalityEstimate": [ { "ce": 481000.0 }, { "requirementCEs": [ { "refProjection": "scan_0", "path": { "nodeType": "PathGet", "path": "in1", "input": { "nodeType": "PathIdentity" } }, "ce": 481000.0 } ] } ], "projections": [ "scan_0" ], "indexingAvailability": { "groupId": 0, "scanProjection": "scan_0", "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192", "eqPredsOnly": false }, "collectionAvailability": [ "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192" ] }, "physicalProperties": { "projections": [ "scan_0" ], "indexingRequirement": { "target": "Complete", "dedupRID": true } } }, "fieldProjectionMap": { "<root>": "scan_0", "in1": "evalTemp_0" }, "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192", "parallel": false, "bindings": { "evalTemp_0": { "nodeType": "Source" }, "scan_0": { "nodeType": "Source" } } } } } }, "slotBasedPlan": { "slots": "$$RESULT=s3 env: { }", "stages": "[2] project [s3 = \n let [\n l101.0 = \n let [\n l102.0 = \n if isObject(s1) \n then keepFields(s1, \"_id\", \"in3\") \n else s1 \n ] \n in \n let [\n l103.0 = (getField(s1, \"in1\") + getField(s1, \"in2\")) \n ] \n in \n if (exists(l103.0) || isObject(l102.0)) \n then setField(l102.0, \"in3\", l103.0) \n else l102.0 \n ] \n in \n if exists(l101.0) \n then l101.0 \n else {} \n] \n[1] filter {\n if fillEmpty(((s2 <=> 500) > 0), false) \n then ((s2 <=> \"\") < 0) \n else false \n} \n[0] scan s1 none none none none none [s2 = in1] @\"50cdb415-9a4f-45c8-a6bf-4cf991418192\" true false " } }, "rejectedPlans": [] }, "executionStats": { "executionSuccess": true, "nReturned": 494877, "executionTimeMillis": 1054, "totalKeysExamined": 0, "totalDocsExamined": 1000000, "executionStages": { "stage": "project", "planNodeId": 2, "nReturned": 494877, "executionTimeMillisEstimate": 681, "executionTimeMicros": 681770, "opens": 1, "closes": 1, "saveState": 0, "restoreState": 0, "isEOF": 1, "projections": { "3": "\n let [\n l101.0 = \n let [\n l102.0 = \n if isObject(s1) \n then keepFields(s1, \"_id\", \"in3\") \n else s1 \n ] \n in \n let [\n l103.0 = (getField(s1, \"in1\") + getField(s1, \"in2\")) \n ] \n in \n if (exists(l103.0) || isObject(l102.0)) \n then setField(l102.0, \"in3\", l103.0) \n else l102.0 \n ] \n in \n if exists(l101.0) \n then l101.0 \n else {} \n" }, "inputStage": { "stage": "filter", "planNodeId": 1, "nReturned": 494877, "executionTimeMillisEstimate": 333, "executionTimeMicros": 333419, "opens": 1, "closes": 1, "saveState": 0, "restoreState": 0, "isEOF": 1, "numTested": 1000000, "filter": "\n if fillEmpty(((s2 <=> 500) > 0), false) \n then ((s2 <=> \"\") < 0) \n else false \n", "inputStage": { "stage": "scan", "planNodeId": 0, "nReturned": 1000000, "executionTimeMillisEstimate": 2, "executionTimeMicros": 2389, "opens": 1, "closes": 1, "saveState": 0, "restoreState": 0, "isEOF": 1, "numReads": 1000000, "recordSlot": 1, "fields": [ "in1" ], "outputSlots": [ 2 ] } } } }, "command": { "aggregate": "c_int_05_1000000", "pipeline": [ { "$match": { "in1": { "$gt": 500 } } }, { "$project": { "in3": { "$add": [ "$in1", "$in2" ] } } } ], "cursor": {}, "$db": "abt_calibration_big" }, "serverInfo": { "host": "ip-10-122-6-29", "port": 27017, "version": "6.2.0-alpha", "gitVersion": "unknown" }, "serverParameters": { "internalQueryFacetBufferSizeBytes": 104857600, "internalQueryFacetMaxOutputDocSizeBytes": 104857600, "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600, "internalDocumentSourceGroupMaxMemoryBytes": 104857600, "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600, "internalQueryProhibitBlockingMergeOnMongoS": 0, "internalQueryMaxAddToSetBytes": 104857600, "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600 }, "ok": 1.0 }
eval_sum_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.995619 | 0.995619 | NaN | NaN | NaN | 0.314770 |
n_returned | 0.995619 | 1.000000 | 1.000000 | NaN | NaN | NaN | 0.292337 |
n_processed | 0.995619 | 1.000000 | 1.000000 | NaN | NaN | NaN | 0.292337 |
keys_length_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
average_document_size_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
number_of_fields | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
source | 0.314770 | 0.292337 | 0.292337 | NaN | NaN | NaN | 1.000000 |
sns.scatterplot(x=eval_sum_df['n_returned'], y=eval_sum_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
exp.calibrate(eval_sum_df)
R2: 0.9912578829453941 Coefficients: [2.75194138e+04 7.07673319e-01]
unwind_df = abt_df[abt_df.abt_type == 'Unwind']
unwind_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
73126 | Unwind | 2573 | 4999248 | 1000000 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14791 |
73130 | Unwind | 2119 | 4999248 | 1000000 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14792 |
73134 | Unwind | 2142 | 4999248 | 1000000 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14793 |
73138 | Unwind | 2208 | 4999248 | 1000000 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14794 |
73142 | Unwind | 2109 | 4999248 | 1000000 | 0 | 65 | 10 | 633664ec7919d20f9deb25bb | 14795 |
unwind_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 765.000000 | 7.650000e+02 | 7.650000e+02 | 765.0 | 765.000000 | 765.0 | 765.000000 |
mean | 6236.175163 | 1.500044e+07 | 3.000000e+06 | 0.0 | 65.411765 | 10.0 | 15214.522876 |
std | 2949.487220 | 6.129025e+06 | 1.225546e+06 | 0.0 | 0.492475 | 0.0 | 245.551487 |
min | 1857.000000 | 4.999248e+06 | 1.000000e+06 | 0.0 | 65.000000 | 10.0 | 14791.000000 |
25% | 4097.000000 | 9.995953e+06 | 2.000000e+06 | 0.0 | 65.000000 | 10.0 | 15002.000000 |
50% | 5873.000000 | 1.500806e+07 | 3.000000e+06 | 0.0 | 65.000000 | 10.0 | 15214.000000 |
75% | 8048.000000 | 2.000739e+07 | 4.000000e+06 | 0.0 | 66.000000 | 10.0 | 15427.000000 |
max | 28858.000000 | 2.499542e+07 | 5.000000e+06 | 0.0 | 66.000000 | 10.0 | 15638.000000 |
unwind_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.857928 | 0.858001 | NaN | 0.169065 | NaN | 0.859299 |
n_returned | 0.857928 | 1.000000 | 1.000000 | NaN | 0.293555 | NaN | 0.998277 |
n_processed | 0.858001 | 1.000000 | 1.000000 | NaN | 0.292770 | NaN | 0.998277 |
keys_length_in_bytes | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
average_document_size_in_bytes | 0.169065 | 0.293555 | 0.292770 | NaN | 1.000000 | NaN | 0.292365 |
number_of_fields | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
source | 0.859299 | 0.998277 | 0.998277 | NaN | 0.292365 | NaN | 1.000000 |
sns.scatterplot(x=unwind_df['n_returned'], y=unwind_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
sns.scatterplot(x=unwind_df['n_processed'], y=unwind_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = unwind_df['execution_time']
X = unwind_df[['n_returned']]
X = sm.add_constant(X)
unwind_lm = sm.OLS(y, X).fit()
unwind_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.736 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.736 |
Method: | Least Squares | F-statistic: | 2128. |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 6.97e-223 |
Time: | 10:59:10 | Log-Likelihood: | -6687.4 |
No. Observations: | 765 | AIC: | 1.338e+04 |
Df Residuals: | 763 | BIC: | 1.339e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 43.0501 | 145.027 | 0.297 | 0.767 | -241.650 | 327.750 |
n_returned | 0.0004 | 8.95e-06 | 46.126 | 0.000 | 0.000 | 0.000 |
Omnibus: | 1214.149 | Durbin-Watson: | 0.554 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 386396.419 |
Skew: | 9.578 | Prob(JB): | 0.00 |
Kurtosis: | 111.422 | Cond. No. | 4.29e+07 |
y_pred = unwind_lm.predict(X)
sns.scatterplot(x=unwind_df['n_processed'], y=unwind_df['execution_time'])
sns.lineplot(x=unwind_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(unwind_df)
R2: 0.7361659234382227 Coefficients: [4.13941176e+01 2.06492702e-03]
scan_df = abt_df[abt_df.abt_type == 'PhysicalScan']
scan_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
45 | PhysicalScan | 1792 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 10 |
48 | PhysicalScan | 1809 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 11 |
51 | PhysicalScan | 1818 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 13 |
54 | PhysicalScan | 1761 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 14 |
57 | PhysicalScan | 1883 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 15 |
scan_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 3987.000000 | 3.987000e+03 | 3.987000e+03 | 3987.000000 | 3987.000000 | 3987.000000 | 3987.000000 |
mean | 6764.166040 | 2.936694e+06 | 2.999686e+06 | 5.798846 | 77.174818 | 1.999498 | 5382.495360 |
std | 3541.646937 | 6.469838e+06 | 1.224764e+06 | 4.271361 | 39.966466 | 3.918513 | 5499.042238 |
min | 1207.000000 | 0.000000e+00 | 1.000000e+06 | 0.000000 | 42.000000 | 0.000000 | 10.000000 |
25% | 4038.500000 | 0.000000e+00 | 2.000000e+06 | 3.000000 | 42.000000 | 0.000000 | 1475.500000 |
50% | 6061.000000 | 0.000000e+00 | 3.000000e+06 | 6.000000 | 65.000000 | 0.000000 | 3074.000000 |
75% | 8717.500000 | 0.000000e+00 | 4.000000e+06 | 9.000000 | 133.000000 | 0.000000 | 5300.500000 |
max | 24517.000000 | 2.499542e+07 | 5.000000e+06 | 12.000000 | 133.000000 | 10.000000 | 15638.000000 |
scan_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.334888 | 0.796007 | -0.125939 | 0.470776 | 0.167759 | 0.366205 |
n_returned | 0.334888 | 1.000000 | 0.185490 | -0.614264 | -0.122880 | 0.908513 | 0.818592 |
n_processed | 0.796007 | 0.185490 | 1.000000 | -0.000096 | 0.000830 | 0.000105 | 0.119858 |
keys_length_in_bytes | -0.125939 | -0.614264 | -0.000096 | 1.000000 | 0.048624 | -0.688095 | -0.697873 |
average_document_size_in_bytes | 0.470776 | -0.122880 | 0.000830 | 0.048624 | 1.000000 | -0.127222 | 0.148642 |
number_of_fields | 0.167759 | 0.908513 | 0.000105 | -0.688095 | -0.127222 | 1.000000 | 0.909306 |
source | 0.366205 | 0.818592 | 0.119858 | -0.697873 | 0.148642 | 0.909306 | 1.000000 |
sns.scatterplot(x=scan_df['n_processed'], y=scan_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
sns.scatterplot(x=scan_df['average_document_size_in_bytes'], y=scan_df['execution_time'], color='blue')
<AxesSubplot:xlabel='average_document_size_in_bytes', ylabel='execution_time'>
y = scan_df['execution_time']
X = scan_df[['n_returned', 'average_document_size_in_bytes']]
X = sm.add_constant(X)
scan_lm = sm.OLS(y, X).fit()
scan_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.378 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.378 |
Method: | Least Squares | F-statistic: | 1212. |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:59:11 | Log-Likelihood: | -37293. |
No. Observations: | 3987 | AIC: | 7.459e+04 |
Df Residuals: | 3984 | BIC: | 7.461e+04 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 2568.4513 | 101.024 | 25.424 | 0.000 | 2370.388 | 2766.515 |
n_returned | 0.0002 | 6.89e-06 | 31.678 | 0.000 | 0.000 | 0.000 |
average_document_size_in_bytes | 46.0601 | 1.115 | 41.291 | 0.000 | 43.873 | 48.247 |
Omnibus: | 30.322 | Durbin-Watson: | 0.038 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 30.813 |
Skew: | 0.213 | Prob(JB): | 2.04e-07 |
Kurtosis: | 3.058 | Cond. No. | 1.62e+07 |
y_pred = scan_lm.predict(X)
sns.scatterplot(x=scan_df['n_processed'], y=scan_df['execution_time'])
sns.lineplot(x=scan_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = scan_df['execution_time']
X = scan_df[['n_processed', 'average_document_size_in_bytes']]
X = sm.add_constant(X)
scan_lm = sm.OLS(y, X).fit()
scan_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.855 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.855 |
Method: | Least Squares | F-statistic: | 1.171e+04 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:59:12 | Log-Likelihood: | -34395. |
No. Observations: | 3987 | AIC: | 6.880e+04 |
Df Residuals: | 3984 | BIC: | 6.882e+04 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -3352.2193 | 70.042 | -47.860 | 0.000 | -3489.540 | -3214.898 |
n_processed | 0.0023 | 1.75e-05 | 131.714 | 0.000 | 0.002 | 0.002 |
average_document_size_in_bytes | 41.6595 | 0.535 | 77.828 | 0.000 | 40.610 | 42.709 |
Omnibus: | 2056.156 | Durbin-Watson: | 0.125 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 40368.358 |
Skew: | 2.007 | Prob(JB): | 0.00 |
Kurtosis: | 18.063 | Cond. No. | 1.06e+07 |
y_pred = scan_lm.predict(X)
sns.scatterplot(x=scan_df['n_processed'], y=scan_df['execution_time'])
sns.lineplot(x=scan_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = scan_df['execution_time']
X = scan_df[['n_processed', 'average_document_size_in_bytes']]
X = sm.add_constant(X)
scan_glm = sm.GLM(y, X).fit_constrained(([1, 0, 0], 0))
scan_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 3987 |
---|---|---|---|
Model: | GLM | Df Residuals: | 3985 |
Model Family: | Gaussian | Df Model: | 1 |
Link Function: | identity | Scale: | 2.8724e+06 |
Method: | IRLS | Log-Likelihood: | -35301. |
Date: | Fri, 30 Sep 2022 | Deviance: | 1.1447e+10 |
Time: | 10:59:12 | Pearson chi2: | 1.14e+10 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | 0.9655 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0 | 0 | nan | nan | 0 | 0 |
n_processed | 0.0017 | 1.46e-05 | 115.111 | 0.000 | 0.002 | 0.002 |
average_document_size_in_bytes | 26.5657 | 0.543 | 48.949 | 0.000 | 25.502 | 27.629 |
y_pred = scan_glm.predict(X)
sns.scatterplot(x=scan_df['n_processed'], y=scan_df['execution_time'])
sns.lineplot(x=scan_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = scan_df['execution_time']
X = scan_df[['n_processed']]
X = sm.add_constant(X)
scan_glm = sm.GLM(y, X).fit_constrained(([1, 0], 0))
scan_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 3987 |
---|---|---|---|
Model: | GLM | Df Residuals: | 3986 |
Model Family: | Gaussian | Df Model: | 0 |
Link Function: | identity | Scale: | 4.5983e+06 |
Method: | IRLS | Log-Likelihood: | -36240. |
Date: | Fri, 30 Sep 2022 | Deviance: | 1.8329e+10 |
Time: | 10:59:13 | Pearson chi2: | 1.83e+10 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | 0.8222 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0 | 0 | nan | nan | 0 | 0 |
n_processed | 0.0023 | 1.05e-05 | 215.773 | 0.000 | 0.002 | 0.002 |
y_pred = scan_glm.predict(X)
sns.scatterplot(x=scan_df['n_processed'], y=scan_df['execution_time'])
sns.lineplot(x=scan_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(scan_df)
R2: 0.6334014132570354 Coefficients: [0. 0.00226165]
ixscan_df = abt_df[abt_df.abt_type == 'IndexScan']
ixscan_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
2 | IndexScan | 157 | 59685 | 59686 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
7 | IndexScan | 182 | 59685 | 59686 | 3 | 42 | 0 | 633624617919d20f9deae62e | 1 |
12 | IndexScan | 176 | 59685 | 59686 | 3 | 42 | 0 | 633624617919d20f9deae62e | 2 |
17 | IndexScan | 181 | 59685 | 59686 | 3 | 42 | 0 | 633624617919d20f9deae62e | 4 |
22 | IndexScan | 127 | 59685 | 59686 | 3 | 42 | 0 | 633624617919d20f9deae62e | 5 |
ixscan_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 9317.000000 | 9.317000e+03 | 9.317000e+03 | 9317.000000 | 9317.000000 | 9317.000000 | 9317.000000 |
mean | 183.653322 | 5.237092e+04 | 5.550288e+04 | 2.280884 | 108.554900 | 0.129655 | 8410.227648 |
std | 472.716287 | 1.347079e+05 | 1.336121e+05 | 2.983782 | 19.266072 | 0.492470 | 3546.327369 |
min | 6.000000 | 0.000000e+00 | 1.000000e+00 | 1.000000 | 42.000000 | 0.000000 | 0.000000 |
25% | 14.000000 | 2.000000e+00 | 1.227000e+03 | 1.000000 | 110.000000 | 0.000000 | 5985.000000 |
50% | 35.000000 | 1.120000e+02 | 6.564000e+03 | 1.000000 | 110.000000 | 0.000000 | 8559.000000 |
75% | 95.000000 | 1.486700e+04 | 2.181900e+04 | 1.000000 | 110.000000 | 0.000000 | 11132.000000 |
max | 6692.000000 | 1.268990e+06 | 1.268991e+06 | 12.000000 | 133.000000 | 2.000000 | 14788.000000 |
ixscan_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.946741 | 0.947593 | 0.169726 | -0.025228 | 0.519053 | 0.016673 |
n_returned | 0.946741 | 1.000000 | 0.999371 | 0.268497 | -0.065274 | 0.439198 | -0.115914 |
n_processed | 0.947593 | 0.999371 | 1.000000 | 0.260639 | -0.064051 | 0.436630 | -0.107480 |
keys_length_in_bytes | 0.169726 | 0.268497 | 0.260639 | 1.000000 | -0.131214 | -0.113032 | -0.632664 |
average_document_size_in_bytes | -0.025228 | -0.065274 | -0.064051 | -0.131214 | 1.000000 | 0.019750 | 0.269986 |
number_of_fields | 0.519053 | 0.439198 | 0.436630 | -0.113032 | 0.019750 | 1.000000 | 0.442477 |
source | 0.016673 | -0.115914 | -0.107480 | -0.632664 | 0.269986 | 0.442477 | 1.000000 |
sns.scatterplot(x=ixscan_df['n_processed'], y=ixscan_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
sns.scatterplot(x=ixscan_df['n_returned'], y=ixscan_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>
y = ixscan_df['execution_time']
X = ixscan_df[['n_processed']]
X = sm.add_constant(X)
ixscan_lm = sm.OLS(y, X).fit()
ixscan_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.898 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.898 |
Method: | Least Squares | F-statistic: | 8.195e+04 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:59:14 | Log-Likelihood: | -59967. |
No. Observations: | 9317 | AIC: | 1.199e+05 |
Df Residuals: | 9315 | BIC: | 1.200e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -2.4234 | 1.694 | -1.430 | 0.153 | -5.745 | 0.898 |
n_processed | 0.0034 | 1.17e-05 | 286.265 | 0.000 | 0.003 | 0.003 |
Omnibus: | 15731.955 | Durbin-Watson: | 0.226 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 22486366.365 |
Skew: | 11.343 | Prob(JB): | 0.00 |
Kurtosis: | 242.601 | Cond. No. | 1.57e+05 |
y_pred = ixscan_lm.predict(X)
sns.scatterplot(x=ixscan_df['n_processed'], y=ixscan_df['execution_time'])
sns.lineplot(x=ixscan_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(ixscan_df)
R2: 0.897909337334287 Coefficients: [0. 0.00334613]
seek_df = abt_df[abt_df.abt_type == 'Seek']
seek_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
3 | Seek | 2906 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 0 |
8 | Seek | 2810 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 1 |
13 | Seek | 2935 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 2 |
18 | Seek | 2990 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 4 |
23 | Seek | 2817 | 59685 | 59685 | 3 | 42 | 0 | 633624617919d20f9deae62e | 5 |
seek_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 1.012700e+04 | 1.012700e+04 | 1.012700e+04 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
mean | 1.810673e+04 | 4.818516e+04 | 4.859375e+04 | 2.178434 | 108.670485 | 0.119285 | 8783.434976 |
std | 8.791293e+04 | 1.299854e+05 | 1.298404e+05 | 2.882973 | 18.483604 | 0.473670 | 3632.120204 |
min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 42.000000 | 0.000000 | 0.000000 |
25% | 7.750000e+01 | 2.000000e+00 | 6.400000e+01 | 1.000000 | 110.000000 | 0.000000 | 6209.500000 |
50% | 1.133000e+03 | 7.500000e+01 | 9.580000e+02 | 1.000000 | 110.000000 | 0.000000 | 9009.000000 |
75% | 7.162000e+03 | 1.139900e+04 | 1.171700e+04 | 1.000000 | 110.000000 | 0.000000 | 11801.500000 |
max | 1.261394e+06 | 1.268990e+06 | 1.268990e+06 | 12.000000 | 133.000000 | 2.000000 | 14788.000000 |
seek_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.778599 | 0.778963 | -0.023600 | 0.022867 | 0.634594 | 0.222285 |
n_returned | 0.778599 | 1.000000 | 0.999955 | 0.278109 | -0.067185 | 0.443475 | -0.145968 |
n_processed | 0.778963 | 0.999955 | 1.000000 | 0.277134 | -0.067034 | 0.443178 | -0.144273 |
keys_length_in_bytes | -0.023600 | 0.278109 | 0.277134 | 1.000000 | -0.132785 | -0.102948 | -0.630184 |
average_document_size_in_bytes | 0.022867 | -0.067185 | -0.067034 | -0.132785 | 1.000000 | 0.018116 | 0.260180 |
number_of_fields | 0.634594 | 0.443475 | 0.443178 | -0.102948 | 0.018116 | 1.000000 | 0.387364 |
source | 0.222285 | -0.145968 | -0.144273 | -0.630184 | 0.260180 | 0.387364 | 1.000000 |
sns.scatterplot(x=seek_df['n_processed'], y=seek_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = seek_df['execution_time']
X = seek_df[['n_processed']]
X = sm.add_constant(X)
seek_lm = sm.OLS(y, X).fit()
seek_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.607 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.607 |
Method: | Least Squares | F-statistic: | 1.562e+04 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 10:59:40 | Log-Likelihood: | -1.2493e+05 |
No. Observations: | 10127 | AIC: | 2.499e+05 |
Df Residuals: | 10125 | BIC: | 2.499e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -7522.7608 | 584.949 | -12.861 | 0.000 | -8669.377 | -6376.145 |
n_processed | 0.5274 | 0.004 | 124.997 | 0.000 | 0.519 | 0.536 |
Omnibus: | 5426.712 | Durbin-Watson: | 0.098 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 316563.039 |
Skew: | 1.799 | Prob(JB): | 0.00 |
Kurtosis: | 30.153 | Cond. No. | 1.48e+05 |
y_pred = seek_lm.predict(X)
sns.scatterplot(x=seek_df['n_processed'], y=seek_df['execution_time'])
sns.lineplot(x=seek_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
y = seek_df['execution_time']
X = seek_df[['n_processed']]
X = sm.add_constant(X)
seek_glm = sm.GLM(y, X).fit_constrained(([1, 0], 0))
seek_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 10127 |
---|---|---|---|
Model: | GLM | Df Residuals: | 10126 |
Model Family: | Gaussian | Df Model: | 0 |
Link Function: | identity | Scale: | 3.0887e+09 |
Method: | IRLS | Log-Likelihood: | -1.2501e+05 |
Date: | Fri, 30 Sep 2022 | Deviance: | 3.1276e+13 |
Time: | 10:59:53 | Pearson chi2: | 3.13e+13 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | 0.7773 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0 | 0 | nan | nan | 0 | 0 |
n_processed | 0.5084 | 0.004 | 127.619 | 0.000 | 0.501 | 0.516 |
y_pred = seek_glm.predict(X)
sns.scatterplot(x=seek_df['n_processed'], y=seek_df['execution_time'])
sns.lineplot(x=seek_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(seek_df)
R2: 0.6003597139964046 Coefficients: [0. 0.50840207]
filter_df = abt_df[abt_df.abt_type == 'Filter']
filter_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | run_id | source | |
---|---|---|---|---|---|---|---|---|---|
47 | Filter | 451034 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 10 |
50 | Filter | 450647 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 11 |
53 | Filter | 448703 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 13 |
56 | Filter | 449444 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 14 |
59 | Filter | 450747 | 0 | 1000000 | 3 | 42 | 0 | 633624647919d20f9deae62f | 15 |
filter_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
count | 7.349000e+03 | 7.349000e+03 | 7.349000e+03 | 7349.000000 | 7349.000000 | 7349.000000 | 7349.000000 |
mean | 6.219597e+05 | 3.184677e+04 | 1.317927e+06 | 3.707579 | 96.833039 | 0.043815 | 7171.164104 |
std | 8.071342e+05 | 2.308601e+05 | 1.692869e+06 | 3.867332 | 32.721953 | 0.292785 | 4587.832281 |
min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | 42.000000 | 0.000000 | 10.000000 |
25% | 1.100000e+02 | 0.000000e+00 | 8.080000e+02 | 1.000000 | 110.000000 | 0.000000 | 2740.000000 |
50% | 3.190000e+03 | 0.000000e+00 | 1.538000e+04 | 1.000000 | 110.000000 | 0.000000 | 7032.000000 |
75% | 1.241072e+06 | 3.000000e+00 | 2.750000e+06 | 6.000000 | 110.000000 | 0.000000 | 11843.000000 |
max | 2.653928e+06 | 2.476095e+06 | 5.000000e+06 | 12.000000 | 133.000000 | 2.000000 | 14752.000000 |
filter_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | number_of_fields | source | |
---|---|---|---|---|---|---|---|
execution_time | 1.000000 | 0.091529 | 0.991513 | 0.708746 | -0.327601 | 0.070194 | -0.650954 |
n_returned | 0.091529 | 1.000000 | 0.176790 | -0.096593 | 0.055516 | 0.918669 | 0.218267 |
n_processed | 0.991513 | 0.176790 | 1.000000 | 0.695579 | -0.399710 | 0.148579 | -0.643943 |
keys_length_in_bytes | 0.708746 | -0.096593 | 0.695579 | 1.000000 | -0.394758 | -0.104787 | -0.716463 |
average_document_size_in_bytes | -0.327601 | 0.055516 | -0.399710 | -0.394758 | 1.000000 | 0.060226 | 0.578946 |
number_of_fields | 0.070194 | 0.918669 | 0.148579 | -0.104787 | 0.060226 | 1.000000 | 0.234023 |
source | -0.650954 | 0.218267 | -0.643943 | -0.716463 | 0.578946 | 0.234023 | 1.000000 |
sns.scatterplot(x=filter_df['n_processed'], y=filter_df['execution_time'], color='blue')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
sns.scatterplot(x=filter_df['keys_length_in_bytes'], y=filter_df['execution_time'], color='blue')
<AxesSubplot:xlabel='keys_length_in_bytes', ylabel='execution_time'>
y = filter_df['execution_time']
X = filter_df[['n_processed', 'keys_length_in_bytes']]
X = sm.add_constant(X)
filter_lm = sm.OLS(y, X).fit()
filter_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.984 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.984 |
Method: | Least Squares | F-statistic: | 2.231e+05 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 11:00:20 | Log-Likelihood: | -95233. |
No. Observations: | 7349 | AIC: | 1.905e+05 |
Df Residuals: | 7346 | BIC: | 1.905e+05 |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -1.351e+04 | 1670.649 | -8.090 | 0.000 | -1.68e+04 | -1.02e+04 |
n_processed | 0.4605 | 0.001 | 467.304 | 0.000 | 0.459 | 0.462 |
keys_length_in_bytes | 7710.6187 | 431.349 | 17.876 | 0.000 | 6865.052 | 8556.186 |
Omnibus: | 1744.168 | Durbin-Watson: | 0.015 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 21711.732 |
Skew: | -0.782 | Prob(JB): | 0.00 |
Kurtosis: | 11.274 | Cond. No. | 3.01e+06 |
y = filter_df['execution_time']
X = filter_df[['n_processed']]
X = sm.add_constant(X)
filter_lm = sm.OLS(y, X).fit()
filter_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.983 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.983 |
Method: | Least Squares | F-statistic: | 4.273e+05 |
Date: | Fri, 30 Sep 2022 | Prob (F-statistic): | 0.00 |
Time: | 11:00:20 | Log-Likelihood: | -95390. |
No. Observations: | 7349 | AIC: | 1.908e+05 |
Df Residuals: | 7347 | BIC: | 1.908e+05 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -1074.9789 | 1551.382 | -0.693 | 0.488 | -4116.133 | 1966.175 |
n_processed | 0.4727 | 0.001 | 653.720 | 0.000 | 0.471 | 0.474 |
Omnibus: | 2654.867 | Durbin-Watson: | 0.012 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 37668.251 |
Skew: | -1.334 | Prob(JB): | 0.00 |
Kurtosis: | 13.766 | Cond. No. | 2.72e+06 |
y = filter_df['execution_time']
X = filter_df[['n_processed']]
X = sm.add_constant(X)
filter_glm = sm.GLM(y, X).fit_constrained(([1, 0], 0))
filter_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 7349 |
---|---|---|---|
Model: | GLM | Df Residuals: | 7348 |
Model Family: | Gaussian | Df Model: | 0 |
Link Function: | identity | Scale: | 1.1011e+10 |
Method: | IRLS | Log-Likelihood: | -95390. |
Date: | Fri, 30 Sep 2022 | Deviance: | 8.0912e+13 |
Time: | 11:00:21 | Pearson chi2: | 8.09e+13 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | 1.000 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0 | 0 | nan | nan | 0 | 0 |
n_processed | 0.4724 | 0.001 | 827.981 | 0.000 | 0.471 | 0.474 |
y_pred = filter_glm.predict(X)
sns.scatterplot(x=filter_df['n_processed'], y=filter_df['execution_time'])
sns.lineplot(x=filter_df['n_processed'],y=y_pred, color='red')
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
exp.calibrate(filter_df)
R2: 0.9830974658266592 Coefficients: [0. 0.4724306]