import math
import bson.json_util as json
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

import sys
sys.path.append('/home/ubuntu/mongo/buildscripts/cost_model')
import experiment as exp
from config import DatabaseConfig
from database_instance import DatabaseInstance


database_config = DatabaseConfig(connection_string='mongodb://localhost',
                                     database_name='abt_calibration_big', dump_path='',
                                     restore_from_dump=False, dump_on_exit=False)
database = DatabaseInstance(database_config)
df = await exp.load_calibration_data(database, 'calibrationData')
df.describe()


# Filter outliers out of the raw calibration data before any modelling.
# NOTE(review): 0.0 and 0.90 are presumably lower/upper quantile bounds —
# confirm against exp.remove_outliers.
noout_df = exp.remove_outliers(df, 0.0, 0.90)
noout_df.describe()


# Derive the per-ABT-node frame from the filtered data; every per-operator
# subset in this notebook is sliced from abt_df by its abt_type column.
abt_df = exp.extract_abt_nodes(noout_df)
abt_df.head()


# List the distinct ABT node types present in the data.
abt_df.abt_type.unique()

array(['LimitSkip', 'BinaryJoin', 'IndexScan', 'Seek', 'Root',
       'PhysicalScan', 'Filter', 'MergeJoin', 'Union', 'HashJoin',
       'GroupBy', 'Evaluation', 'Unwind'], dtype=object)


# --- MergeJoin ---
# Keep only the rows whose ABT node type is MergeJoin.
merge_join_df = abt_df[abt_df.abt_type == 'MergeJoin']
merge_join_df.head()


# Print SBE and ABT plan trees for the MergeJoin subset.
exp.print_trees(noout_df, merge_join_df)

SBE
stage: nlj, plaNodeId: 7, totalExecutionTime: 647, nReturned: 0, nProcessed: 0
| stage: mj, plaNodeId: 4, totalExecutionTime: 647, nReturned: 0, nProcessed: 1252
| | stage: ixseek, plaNodeId: 0, totalExecutionTime: 11, nReturned: 612, nProcessed: 612
| | stage: ixseek, plaNodeId: 1, totalExecutionTime: 3, nReturned: 640, nProcessed: 641
| stage: limitskip, plaNodeId: 6, totalExecutionTime: 0, nReturned: 0, nProcessed: 0
| | stage: seek, plaNodeId: 5, totalExecutionTime: 0, nReturned: 0, nProcessed: 0

ABT
nodeType: Root, plaNodeId: 8
| nodeType: BinaryJoin, plaNodeId: 7
| | nodeType: MergeJoin, plaNodeId: 4
| | | nodeType: IndexScan, plaNodeId: 0
| | | nodeType: Union, plaNodeId: 3
| | nodeType: LimitSkip, plaNodeId: 6
| | | nodeType: Seek, plaNodeId: 5


# Summary statistics for the MergeJoin rows.
merge_join_df.describe()


# The frame still carries the string column abt_type. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them, so ask for the numeric-only matrix explicitly (needs pandas >= 1.5).
merge_join_df.corr(numeric_only=True)


# Raw relationship between rows processed and execution time.
sns.scatterplot(x=merge_join_df['n_processed'], y=merge_join_df['execution_time'])

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# OLS for MergeJoin: execution_time ~ const + n_processed.
X = sm.add_constant(merge_join_df[['n_processed']])
y = merge_join_df['execution_time']
mj_lm = sm.OLS(y, X).fit()
mj_lm.summary()


# Overlay the fitted values (red line) on the observed points.
y_pred = mj_lm.predict(X)
sns.scatterplot(data=merge_join_df, x='n_processed', y='execution_time')
sns.lineplot(x=merge_join_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Refit MergeJoin with both n_processed and n_returned as predictors.
X = sm.add_constant(merge_join_df[['n_processed', 'n_returned']])
y = merge_join_df['execution_time']
mj_lm = sm.OLS(y, X).fit()
mj_lm.summary()


# Run the shared calibration helper on the MergeJoin rows
# (it reports an R2 value and the fitted coefficients).
exp.calibrate(merge_join_df)

R2: 0.9997618248163389
Coefficients: [23.75586675  0.48160358]


# --- BinaryJoin ---
# Keep only the rows whose ABT node type is BinaryJoin.
binary_join_df = abt_df[abt_df.abt_type == 'BinaryJoin']
binary_join_df.head()


# Print SBE and ABT plan trees for the BinaryJoin subset.
exp.print_trees(noout_df, binary_join_df)

SBE
stage: nlj, plaNodeId: 3, totalExecutionTime: 63656, nReturned: 59685, nProcessed: 59685
| stage: ixseek, plaNodeId: 0, totalExecutionTime: 157, nReturned: 59685, nProcessed: 59686
| stage: limitskip, plaNodeId: 2, totalExecutionTime: 5115, nReturned: 59685, nProcessed: 59685
| | stage: seek, plaNodeId: 1, totalExecutionTime: 2906, nReturned: 59685, nProcessed: 59685

ABT
nodeType: Root, plaNodeId: 4
| nodeType: BinaryJoin, plaNodeId: 3
| | nodeType: IndexScan, plaNodeId: 0
| | nodeType: LimitSkip, plaNodeId: 2
| | | nodeType: Seek, plaNodeId: 1


# Summary statistics for the BinaryJoin rows.
binary_join_df.describe()


# The frame still carries the string column abt_type. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them, so ask for the numeric-only matrix explicitly (needs pandas >= 1.5).
binary_join_df.corr(numeric_only=True)


# Execution time against rows processed.
sns.scatterplot(x=binary_join_df['n_processed'], y=binary_join_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Same view as the previous plot, but against rows returned instead of rows processed.
sns.scatterplot(x=binary_join_df['n_returned'], y=binary_join_df['execution_time'], color='red')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# OLS for BinaryJoin with both candidate predictors.
X = sm.add_constant(binary_join_df[['n_processed', 'n_returned']])
y = binary_join_df['execution_time']
bj_lm = sm.OLS(y, X).fit()
bj_lm.summary()


# Single-predictor variant: execution_time ~ const + n_returned.
# bj_lm is rebound here, so the plot cell that follows uses this model.
X = sm.add_constant(binary_join_df[['n_returned']])
y = binary_join_df['execution_time']
bj_lm = sm.OLS(y, X).fit()
bj_lm.summary()


# Overlay the fitted values (red line) on the observed points.
y_pred = bj_lm.predict(X)
sns.scatterplot(data=binary_join_df, x='n_returned', y='execution_time')
sns.lineplot(x=binary_join_df['n_returned'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# Single-predictor variant: execution_time ~ const + n_processed.
X = sm.add_constant(binary_join_df[['n_processed']])
y = binary_join_df['execution_time']
bj_lm = sm.OLS(y, X).fit()
bj_lm.summary()


# Overlay the fitted values (red line) on the observed points.
y_pred = bj_lm.predict(X)
sns.scatterplot(data=binary_join_df, x='n_processed', y='execution_time')
sns.lineplot(x=binary_join_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the shared calibration helper on the BinaryJoin rows with its default
# arguments (it reports an R2 value and the fitted coefficients).
exp.calibrate(binary_join_df)

R2: 0.9998340702333239
Coefficients: [0.         0.98244794]


# --- HashJoin ---
# Keep only the rows whose ABT node type is HashJoin.
hash_join_df = abt_df[abt_df.abt_type == 'HashJoin']
hash_join_df.head()


# Print SBE and ABT plan trees for the HashJoin subset.
exp.print_trees(noout_df, hash_join_df)

SBE
stage: nlj, plaNodeId: 7, totalExecutionTime: 2871, nReturned: 0, nProcessed: 0
| stage: hj, plaNodeId: 4, totalExecutionTime: 2870, nReturned: 0, nProcessed: 4741
| | stage: ixseek, plaNodeId: 1, totalExecutionTime: 52, nReturned: 4568, nProcessed: 4569
| | stage: ixseek, plaNodeId: 0, totalExecutionTime: 6, nReturned: 173, nProcessed: 174
| stage: limitskip, plaNodeId: 6, totalExecutionTime: 0, nReturned: 0, nProcessed: 0
| | stage: seek, plaNodeId: 5, totalExecutionTime: 0, nReturned: 0, nProcessed: 0

ABT
nodeType: Root, plaNodeId: 8
| nodeType: BinaryJoin, plaNodeId: 7
| | nodeType: HashJoin, plaNodeId: 4
| | | nodeType: IndexScan, plaNodeId: 0
| | | nodeType: Union, plaNodeId: 3
| | nodeType: LimitSkip, plaNodeId: 6
| | | nodeType: Seek, plaNodeId: 5


# Summary statistics for the HashJoin rows.
hash_join_df.describe()


# The frame still carries the string column abt_type. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them, so ask for the numeric-only matrix explicitly (needs pandas >= 1.5).
hash_join_df.corr(numeric_only=True)


# Execution time against rows processed.
sns.scatterplot(x=hash_join_df['n_processed'], y=hash_join_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# OLS for HashJoin with both candidate predictors.
X = sm.add_constant(hash_join_df[['n_processed', 'n_returned']])
y = hash_join_df['execution_time']
hj_lm = sm.OLS(y, X).fit()
hj_lm.summary()


# Single-predictor variant: execution_time ~ const + n_processed.
# hj_lm is rebound here, so the plot cell that follows uses this model.
X = sm.add_constant(hash_join_df[['n_processed']])
y = hash_join_df['execution_time']
hj_lm = sm.OLS(y, X).fit()
hj_lm.summary()


# Overlay the fitted values (red line) on the observed points.
y_pred = hj_lm.predict(X)
sns.scatterplot(data=hash_join_df, x='n_processed', y='execution_time')
sns.lineplot(x=hash_join_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the shared calibration helper on the HashJoin rows with its default
# arguments (it reports an R2 value and the fitted coefficients).
exp.calibrate(hash_join_df)

R2: 0.9887787330156012
Coefficients: [0.         1.26765332]


# --- Union ---
# Union nodes only, keeping just the rows with a strictly positive execution_time.
union_df = abt_df[(abt_df.abt_type == 'Union') & (abt_df.execution_time > 0)]
union_df.head()


# Print SBE and ABT plan trees for the Union subset.
exp.print_trees(noout_df, union_df)

SBE
stage: nlj, plaNodeId: 9, totalExecutionTime: 1482, nReturned: 0, nProcessed: 0
| stage: filter, plaNodeId: 6, totalExecutionTime: 1479, nReturned: 0, nProcessed: 1252
| | stage: group, plaNodeId: 5, totalExecutionTime: 1292, nReturned: 1252, nProcessed: 1252
| | | stage: union, plaNodeId: 4, totalExecutionTime: 23, nReturned: 1252, nProcessed: 1252
| | | | stage: project, plaNodeId: 1, totalExecutionTime: 12, nReturned: 612, nProcessed: 612
| | | | | stage: ixseek, plaNodeId: 0, totalExecutionTime: 11, nReturned: 612, nProcessed: 613
| | | | stage: project, plaNodeId: 3, totalExecutionTime: 5, nReturned: 640, nProcessed: 640
| | | | | stage: ixseek, plaNodeId: 2, totalExecutionTime: 4, nReturned: 640, nProcessed: 641
| stage: limitskip, plaNodeId: 8, totalExecutionTime: 0, nReturned: 0, nProcessed: 0
| | stage: seek, plaNodeId: 7, totalExecutionTime: 0, nReturned: 0, nProcessed: 0

ABT
nodeType: Root, plaNodeId: 10
| nodeType: BinaryJoin, plaNodeId: 9
| | nodeType: Filter, plaNodeId: 6
| | | nodeType: GroupBy, plaNodeId: 5
| | | | nodeType: Union, plaNodeId: 4
| | nodeType: LimitSkip, plaNodeId: 8
| | | nodeType: Seek, plaNodeId: 7


# Summary statistics for the Union rows.
union_df.describe()


# The frame still carries the string column abt_type. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them, so ask for the numeric-only matrix explicitly (needs pandas >= 1.5).
union_df.corr(numeric_only=True)


# Execution time against rows processed.
sns.scatterplot(x=union_df['n_processed'], y=union_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# OLS for Union: execution_time ~ const + n_processed.
X = sm.add_constant(union_df[['n_processed']])
y = union_df['execution_time']
union_lm = sm.OLS(y, X).fit()
union_lm.summary()


# Overlay the fitted values (red line) on the observed points.
y_pred = union_lm.predict(X)
sns.scatterplot(data=union_df, x='n_processed', y='execution_time')
sns.lineplot(x=union_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the shared calibration helper on the Union rows with its default
# arguments (it reports an R2 value and the fitted coefficients).
exp.calibrate(union_df)

R2: 0.8980073052101615
Coefficients: [0.         0.00460031]


# --- LimitSkip ---
# Keep only the rows whose ABT node type is LimitSkip.
ls_df = abt_df[abt_df.abt_type == 'LimitSkip']
ls_df.head()


# Summary statistics for the LimitSkip rows.
ls_df.describe()


# The frame still carries the string column abt_type. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them, so ask for the numeric-only matrix explicitly (needs pandas >= 1.5).
ls_df.corr(numeric_only=True)


# Execution time against rows processed.
sns.scatterplot(x=ls_df['n_processed'], y=ls_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Same view as the previous plot, but against rows returned instead of rows processed.
sns.scatterplot(x=ls_df['n_returned'], y=ls_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# Unconstrained OLS for LimitSkip: execution_time ~ const + n_processed.
X = sm.add_constant(ls_df[['n_processed']])
y = ls_df['execution_time']
ls_lm = sm.OLS(y, X).fit()
ls_lm.summary()


# Overlay the fitted values (red line) on the observed points.
y_pred = ls_lm.predict(X)
sns.scatterplot(data=ls_df, x='n_processed', y='execution_time')
sns.lineplot(x=ls_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Constrained fit for LimitSkip. The (R, q) pair ([1, 0], q) imposes
# 1 * params[0] + 0 * params[1] = q, i.e. it fixes the first coefficient
# (the constant column added above n_processed) to the smallest observed
# execution_time.
X = sm.add_constant(ls_df[['n_processed']])
y = ls_df['execution_time']
ls_glm = sm.GLM(y, X).fit_constrained(([1, 0], y.min()))
ls_glm.summary()


# Overlay the constrained fit (red line) on the observed points.
y_pred = ls_glm.predict(X)
sns.scatterplot(data=ls_df, x='n_processed', y='execution_time')
sns.lineplot(x=ls_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the shared calibration helper on the LimitSkip rows with its default
# arguments (it reports an R2 value and the fitted coefficients).
exp.calibrate(ls_df)

R2: 0.7931314002167056
Coefficients: [0.         0.09472212]


# --- GroupBy ---
# Keep only the rows whose ABT node type is GroupBy.
group_df = abt_df[abt_df.abt_type == 'GroupBy']
group_df.head()


# Print SBE and ABT plan trees for the GroupBy subset.
# NOTE(review): this passes the unfiltered df, whereas the join/union sections
# pass noout_df (the frame abt_df was derived from) — confirm this is intended.
exp.print_trees(df, group_df)

SBE
stage: nlj, plaNodeId: 9, totalExecutionTime: 1482, nReturned: 0, nProcessed: 0
| stage: filter, plaNodeId: 6, totalExecutionTime: 1479, nReturned: 0, nProcessed: 1252
| | stage: group, plaNodeId: 5, totalExecutionTime: 1292, nReturned: 1252, nProcessed: 1252
| | | stage: union, plaNodeId: 4, totalExecutionTime: 23, nReturned: 1252, nProcessed: 1252
| | | | stage: project, plaNodeId: 1, totalExecutionTime: 12, nReturned: 612, nProcessed: 612
| | | | | stage: ixseek, plaNodeId: 0, totalExecutionTime: 11, nReturned: 612, nProcessed: 613
| | | | stage: project, plaNodeId: 3, totalExecutionTime: 5, nReturned: 640, nProcessed: 640
| | | | | stage: ixseek, plaNodeId: 2, totalExecutionTime: 4, nReturned: 640, nProcessed: 641
| stage: limitskip, plaNodeId: 8, totalExecutionTime: 0, nReturned: 0, nProcessed: 0
| | stage: seek, plaNodeId: 7, totalExecutionTime: 0, nReturned: 0, nProcessed: 0

ABT
nodeType: Root, plaNodeId: 10
| nodeType: BinaryJoin, plaNodeId: 9
| | nodeType: Filter, plaNodeId: 6
| | | nodeType: GroupBy, plaNodeId: 5
| | | | nodeType: Union, plaNodeId: 4
| | nodeType: LimitSkip, plaNodeId: 8
| | | nodeType: Seek, plaNodeId: 7


# Summary statistics for the GroupBy rows.
group_df.describe()


# The frame still carries the string column abt_type. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them, so ask for the numeric-only matrix explicitly (needs pandas >= 1.5).
group_df.corr(numeric_only=True)


# Execution time against rows returned.
sns.scatterplot(x=group_df['n_returned'], y=group_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# Same view as the previous plot, but against rows processed instead of rows returned.
sns.scatterplot(x=group_df['n_processed'], y=group_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Unconstrained OLS for GroupBy: execution_time ~ const + n_processed.
X = sm.add_constant(group_df[['n_processed']])
y = group_df['execution_time']
group_lm = sm.OLS(y, X).fit()
group_lm.summary()


# Overlay the fitted values (red line) on the observed points.
y_pred = group_lm.predict(X)
sns.scatterplot(data=group_df, x='n_processed', y='execution_time')
sns.lineplot(x=group_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Constrained fit for GroupBy. The (R, q) pair ([1, 0], q) imposes
# 1 * params[0] + 0 * params[1] = q, i.e. it fixes the first coefficient
# (the constant column added above n_processed) to the smallest observed
# execution_time.
X = sm.add_constant(group_df[['n_processed']])
y = group_df['execution_time']
group_glm = sm.GLM(y, X).fit_constrained(([1, 0], y.min()))
group_glm.summary()


# Overlay the constrained fit (red line) on the observed points.
y_pred = group_glm.predict(X)
sns.scatterplot(data=group_df, x='n_processed', y='execution_time')
sns.lineplot(x=group_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the shared calibration helper on the GroupBy rows with its default
# arguments (it reports an R2 value and the fitted coefficients).
exp.calibrate(group_df)

R2: 0.9931070626694147
Coefficients: [0.         1.43812359]


# --- Evaluation ---
# Evaluation nodes only, keeping just the rows with a positive number_of_fields.
eval_df = abt_df[(abt_df.abt_type == 'Evaluation') & (abt_df.number_of_fields > 0)]
eval_df.head()


# Distinct number_of_fields values left after filtering; later cells split on them.
eval_df.number_of_fields.unique()

array([ 2, 10])


# Print SBE and ABT plan trees for the Evaluation subset.
# NOTE(review): this passes the unfiltered df, whereas the join/union sections
# pass noout_df (the frame abt_df was derived from) — confirm this is intended.
exp.print_trees(df, eval_df)

SBE
stage: project, plaNodeId: 2, totalExecutionTime: 681770, nReturned: 494877, nProcessed: 494877
| stage: filter, plaNodeId: 1, totalExecutionTime: 333419, nReturned: 494877, nProcessed: 1000000
| | stage: scan, plaNodeId: 0, totalExecutionTime: 2389, nReturned: 1000000, nProcessed: 1000000

ABT
nodeType: Root, plaNodeId: 3
| nodeType: Evaluation, plaNodeId: 2
| | nodeType: Filter, plaNodeId: 1
| | | nodeType: PhysicalScan, plaNodeId: 0


# Summary statistics for the Evaluation rows.
eval_df.describe()


# The frame still carries the string column abt_type. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them, so ask for the numeric-only matrix explicitly (needs pandas >= 1.5).
eval_df.corr(numeric_only=True)


# Execution time against rows returned.
sns.scatterplot(x=eval_df['n_returned'], y=eval_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# OLS for Evaluation: execution_time ~ const + n_processed + keys_length_in_bytes.
X = sm.add_constant(eval_df[['n_processed', 'keys_length_in_bytes']])
y = eval_df['execution_time']
eval_lm = sm.OLS(y, X).fit()
eval_lm.summary()


# Overlay the fitted values (red line) on the observed points.
# NOTE(review): the model has two predictors but the overlay is drawn against
# n_processed only, so the line is a projection of the fit onto one predictor.
y_pred = eval_lm.predict(X)
sns.scatterplot(data=eval_df, x='n_processed', y='execution_time')
sns.lineplot(x=eval_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Constrained fit for Evaluation. The (R, q) pair ([1, 0], 0) imposes
# 1 * params[0] + 0 * params[1] = 0, i.e. it fixes the first coefficient
# (the constant column added above n_processed) to zero.
X = sm.add_constant(eval_df[['n_processed']])
y = eval_df['execution_time']
eval_glm = sm.GLM(y, X).fit_constrained(([1, 0], 0))
eval_glm.summary()


# Overlay the constrained fit (red line) on the observed points.
y_pred = eval_glm.predict(X)
sns.scatterplot(data=eval_df, x='n_processed', y='execution_time')
sns.lineplot(x=eval_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Calibrate Evaluation on an explicit variable list: n_returned and number_of_fields.
# The recorded result for this call is R2 = 0.0 with all-zero slopes, i.e. the
# fit over the combined Evaluation rows explains none of the variance.
exp.calibrate(eval_df, ['n_returned', 'number_of_fields'])

R2: 0.0
Coefficients: [209681.42810457      0.              0.        ]


# Same calibration restricted to n_returned only; the recorded R2 is again ~0,
# which motivates splitting eval_df by number_of_fields in the cells that follow.
exp.calibrate(eval_df, ['n_returned'])

R2: -2.220446049250313e-16
Coefficients: [209681.42810457      0.        ]


# Evaluation rows with number_of_fields == 10. The sample plan printed for this
# subset is an Evaluation sitting on top of an Unwind node.
eval_unwind_df = eval_df[eval_df.number_of_fields == 10]
eval_unwind_df.head()


# Print SBE and ABT plan trees for this subset.
# NOTE(review): this passes the unfiltered df, whereas the join/union sections
# pass noout_df — confirm this is intended.
exp.print_trees(df, eval_unwind_df)

SBE
stage: project, plaNodeId: 2, totalExecutionTime: 15251, nReturned: 4999248, nProcessed: 4999248
| stage: unwind, plaNodeId: 1, totalExecutionTime: 5255, nReturned: 4999248, nProcessed: 1000000
| | stage: scan, plaNodeId: 0, totalExecutionTime: 2682, nReturned: 1000000, nProcessed: 1000000

ABT
nodeType: Root, plaNodeId: 3
| nodeType: Evaluation, plaNodeId: 2
| | nodeType: Unwind, plaNodeId: 1
| | | nodeType: PhysicalScan, plaNodeId: 0


# Dump a full explain document (plan, execution stats, command, server info)
# for the number_of_fields == 10 subset.
# NOTE(review): looked up against the unfiltered df rather than noout_df — confirm.
exp.print_explain(df, eval_unwind_df)

{
    "explainVersion": "2",
    "queryPlanner": {
        "namespace": "abt_calibration_big.c_arr_01_1000000",
        "indexFilterSet": false,
        "optimizedPipeline": true,
        "maxIndexedOrSolutionsReached": false,
        "maxIndexedAndSolutionsReached": false,
        "maxScansToExplodeReached": false,
        "winningPlan": {
            "optimizerPlan": {
                "nodeType": "Root",
                "properties": {
                    "cost": 20900.000002,
                    "localCost": 0.0,
                    "adjustedCE": 10000000.0,
                    "planNodeID": 3,
                    "logicalProperties": {
                        "cardinalityEstimate": [
                            {
                                "ce": 10000000.0
                            }
                        ],
                        "projections": [
                            "embedProj_0",
                            "scan_0",
                            "unwoundPid_0",
                            "unwoundProj_0"
                        ],
                        "collectionAvailability": [
                            "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c"
                        ]
                    }
                },
                "projections": [
                    "embedProj_0"
                ],
                "references": [
                    {
                        "nodeType": "Variable",
                        "name": "embedProj_0"
                    }
                ],
                "child": {
                    "nodeType": "Evaluation",
                    "properties": {
                        "cost": 20900.000002,
                        "localCost": 20000.000001,
                        "adjustedCE": 10000000.0,
                        "planNodeID": 2,
                        "logicalProperties": {
                            "cardinalityEstimate": [
                                {
                                    "ce": 10000000.0
                                }
                            ],
                            "projections": [
                                "embedProj_0",
                                "scan_0",
                                "unwoundPid_0",
                                "unwoundProj_0"
                            ],
                            "collectionAvailability": [
                                "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c"
                            ]
                        },
                        "physicalProperties": {
                            "projections": [
                                "embedProj_0"
                            ]
                        }
                    },
                    "projection": {
                        "embedProj_0": {
                            "nodeType": "If",
                            "condition": {
                                "nodeType": "BinaryOp",
                                "op": "Or",
                                "left": {
                                    "nodeType": "FunctionCall",
                                    "name": "exists",
                                    "arguments": [
                                        {
                                            "nodeType": "Variable",
                                            "name": "unwoundProj_0"
                                        }
                                    ]
                                },
                                "right": {
                                    "nodeType": "FunctionCall",
                                    "name": "isObject",
                                    "arguments": [
                                        {
                                            "nodeType": "Variable",
                                            "name": "scan_0"
                                        }
                                    ]
                                }
                            },
                            "then": {
                                "nodeType": "FunctionCall",
                                "name": "setField",
                                "arguments": [
                                    {
                                        "nodeType": "Variable",
                                        "name": "scan_0"
                                    },
                                    {
                                        "nodeType": "Const",
                                        "value": "as"
                                    },
                                    {
                                        "nodeType": "Variable",
                                        "name": "unwoundProj_0"
                                    }
                                ]
                            },
                            "else": {
                                "nodeType": "Variable",
                                "name": "scan_0"
                            }
                        }
                    },
                    "child": {
                        "nodeType": "Unwind",
                        "properties": {
                            "cost": 900.000001,
                            "localCost": 300.0,
                            "adjustedCE": 10000000.0,
                            "planNodeID": 1,
                            "logicalProperties": {
                                "cardinalityEstimate": [
                                    {
                                        "ce": 10000000.0
                                    }
                                ],
                                "projections": [
                                    "scan_0",
                                    "unwoundPid_0",
                                    "unwoundProj_0"
                                ],
                                "collectionAvailability": [
                                    "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c"
                                ]
                            },
                            "physicalProperties": {
                                "projections": [
                                    "scan_0",
                                    "unwoundProj_0"
                                ]
                            }
                        },
                        "retainNonArrays": false,
                        "bind": {
                            "unwoundPid_0": {
                                "nodeType": "Source"
                            },
                            "unwoundProj_0": {
                                "nodeType": "Source"
                            }
                        },
                        "child": {
                            "nodeType": "PhysicalScan",
                            "properties": {
                                "cost": 600.000001,
                                "localCost": 600.000001,
                                "adjustedCE": 1000000.0,
                                "planNodeID": 0,
                                "logicalProperties": {
                                    "cardinalityEstimate": [
                                        {
                                            "ce": 1000000.0
                                        },
                                        {
                                            "requirementCEs": [
                                                {
                                                    "refProjection": "scan_0",
                                                    "path": {
                                                        "nodeType": "PathGet",
                                                        "path": "as",
                                                        "input": {
                                                            "nodeType": "PathIdentity"
                                                        }
                                                    },
                                                    "ce": 1000000.0
                                                }
                                            ]
                                        }
                                    ],
                                    "projections": [
                                        "scan_0",
                                        "unwoundProj_0"
                                    ],
                                    "indexingAvailability": {
                                        "groupId": 0,
                                        "scanProjection": "scan_0",
                                        "scanDefName": "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c",
                                        "eqPredsOnly": false
                                    },
                                    "collectionAvailability": [
                                        "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c"
                                    ]
                                },
                                "physicalProperties": {
                                    "projections": [
                                        "scan_0",
                                        "unwoundProj_0"
                                    ],
                                    "indexingRequirement": {
                                        "target": "Complete",
                                        "dedupRID": true
                                    }
                                }
                            },
                            "fieldProjectionMap": {
                                "<root>": "scan_0",
                                "as": "unwoundProj_0"
                            },
                            "scanDefName": "c_arr_01_1000000_fc452517-8329-417c-93b1-28e04369853c",
                            "parallel": false,
                            "bindings": {
                                "scan_0": {
                                    "nodeType": "Source"
                                },
                                "unwoundProj_0": {
                                    "nodeType": "Source"
                                }
                            }
                        }
                    }
                }
            },
            "slotBasedPlan": {
                "slots": "$$RESULT=s5 env: {  }",
                "stages": "[2] project [s5 = \n    if (exists(s3) || isObject(s1)) \n    then setField(s1, \"as\", s3) \n    else s1 \n] \n[1] unwind s3 s4 s2 false \n[0] scan s1 none none none none none [s2 = as] @\"fc452517-8329-417c-93b1-28e04369853c\" true false "
            }
        },
        "rejectedPlans": []
    },
    "executionStats": {
        "executionSuccess": true,
        "nReturned": 4999248,
        "executionTimeMillis": 3213,
        "totalKeysExamined": 0,
        "totalDocsExamined": 1000000,
        "executionStages": {
            "stage": "project",
            "planNodeId": 2,
            "nReturned": 4999248,
            "executionTimeMillisEstimate": 15,
            "executionTimeMicros": 15251,
            "opens": 1,
            "closes": 1,
            "saveState": 0,
            "restoreState": 0,
            "isEOF": 1,
            "projections": {
                "5": "\n    if (exists(s3) || isObject(s1)) \n    then setField(s1, \"as\", s3) \n    else s1 \n"
            },
            "inputStage": {
                "stage": "unwind",
                "planNodeId": 1,
                "nReturned": 4999248,
                "executionTimeMillisEstimate": 5,
                "executionTimeMicros": 5255,
                "opens": 1,
                "closes": 1,
                "saveState": 0,
                "restoreState": 0,
                "isEOF": 1,
                "inputSlot": 2,
                "outSlot": 3,
                "outIndexSlot": 4,
                "preserveNullAndEmptyArrays": 0,
                "inputStage": {
                    "stage": "scan",
                    "planNodeId": 0,
                    "nReturned": 1000000,
                    "executionTimeMillisEstimate": 2,
                    "executionTimeMicros": 2682,
                    "opens": 1,
                    "closes": 1,
                    "saveState": 0,
                    "restoreState": 0,
                    "isEOF": 1,
                    "numReads": 1000000,
                    "recordSlot": 1,
                    "fields": [
                        "as"
                    ],
                    "outputSlots": [
                        2
                    ]
                }
            }
        }
    },
    "command": {
        "aggregate": "c_arr_01_1000000",
        "pipeline": [
            {
                "$unwind": "$as"
            }
        ],
        "cursor": {},
        "$db": "abt_calibration_big"
    },
    "serverInfo": {
        "host": "ip-10-122-6-29",
        "port": 27017,
        "version": "6.2.0-alpha",
        "gitVersion": "unknown"
    },
    "serverParameters": {
        "internalQueryFacetBufferSizeBytes": 104857600,
        "internalQueryFacetMaxOutputDocSizeBytes": 104857600,
        "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
        "internalDocumentSourceGroupMaxMemoryBytes": 104857600,
        "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
        "internalQueryProhibitBlockingMergeOnMongoS": 0,
        "internalQueryMaxAddToSetBytes": 104857600,
        "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600
    },
    "ok": 1.0
}


# The frame still carries the string column abt_type. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on them, so ask for the numeric-only matrix explicitly (needs pandas >= 1.5).
eval_unwind_df.corr(numeric_only=True)


# Execution time against rows returned for the number_of_fields == 10 subset.
sns.scatterplot(x=eval_unwind_df['n_returned'], y=eval_unwind_df['execution_time'], color='blue')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# Calibrate the number_of_fields == 10 subset on n_returned alone
# (it reports an R2 value and the fitted coefficients).
exp.calibrate(eval_unwind_df, ['n_returned'])

R2: 0.8197887625374135
Coefficients: [0.         0.00144283]


# Evaluation rows with number_of_fields == 2. The sample plan printed for this
# subset is an Evaluation sitting on top of a Filter over a PhysicalScan.
eval_sum_df = eval_df[eval_df.number_of_fields == 2]
eval_sum_df.head()


# Print SBE and ABT plan trees for this subset.
# NOTE(review): this passes the unfiltered df, whereas the join/union sections
# pass noout_df — confirm this is intended.
exp.print_trees(df, eval_sum_df)

SBE
stage: project, plaNodeId: 2, totalExecutionTime: 681770, nReturned: 494877, nProcessed: 494877
| stage: filter, plaNodeId: 1, totalExecutionTime: 333419, nReturned: 494877, nProcessed: 1000000
| | stage: scan, plaNodeId: 0, totalExecutionTime: 2389, nReturned: 1000000, nProcessed: 1000000

ABT
nodeType: Root, plaNodeId: 3
| nodeType: Evaluation, plaNodeId: 2
| | nodeType: Filter, plaNodeId: 1
| | | nodeType: PhysicalScan, plaNodeId: 0


# Dump a full explain document for the number_of_fields == 2 subset.
# NOTE(review): looked up against the unfiltered df rather than noout_df — confirm.
exp.print_explain(df, eval_sum_df)

{
    "explainVersion": "2",
    "queryPlanner": {
        "namespace": "abt_calibration_big.c_int_05_1000000",
        "indexFilterSet": false,
        "optimizedPipeline": true,
        "maxIndexedOrSolutionsReached": false,
        "maxIndexedAndSolutionsReached": false,
        "maxScansToExplodeReached": false,
        "winningPlan": {
            "optimizerPlan": {
                "nodeType": "Root",
                "properties": {
                    "cost": 1762.000003,
                    "localCost": 0.0,
                    "adjustedCE": 481000.0,
                    "planNodeID": 3,
                    "logicalProperties": {
                        "cardinalityEstimate": [
                            {
                                "ce": 481000.0
                            }
                        ],
                        "projections": [
                            "combinedProjection_0",
                            "scan_0"
                        ],
                        "indexingAvailability": {
                            "groupId": 0,
                            "scanProjection": "scan_0",
                            "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192",
                            "eqPredsOnly": false
                        },
                        "collectionAvailability": [
                            "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192"
                        ]
                    },
                    "physicalProperties": {
                        "indexingRequirement": {
                            "target": "Complete",
                            "dedupRID": true
                        }
                    }
                },
                "projections": [
                    "combinedProjection_0"
                ],
                "references": [
                    {
                        "nodeType": "Variable",
                        "name": "combinedProjection_0"
                    }
                ],
                "child": {
                    "nodeType": "Evaluation",
                    "properties": {
                        "cost": 1762.000003,
                        "localCost": 962.000001,
                        "adjustedCE": 481000.0,
                        "planNodeID": 2,
                        "logicalProperties": {
                            "cardinalityEstimate": [
                                {
                                    "ce": 481000.0
                                }
                            ],
                            "projections": [
                                "combinedProjection_0",
                                "scan_0"
                            ],
                            "indexingAvailability": {
                                "groupId": 0,
                                "scanProjection": "scan_0",
                                "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192",
                                "eqPredsOnly": false
                            },
                            "collectionAvailability": [
                                "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192"
                            ]
                        },
                        "physicalProperties": {
                            "projections": [
                                "combinedProjection_0"
                            ],
                            "indexingRequirement": {
                                "target": "Complete",
                                "dedupRID": true
                            }
                        }
                    },
                    "projection": {
                        "combinedProjection_0": {
                            "nodeType": "Let",
                            "variable": "valDefault_0",
                            "bind": {
                                "nodeType": "Let",
                                "variable": "inputField_0",
                                "bind": {
                                    "nodeType": "If",
                                    "condition": {
                                        "nodeType": "FunctionCall",
                                        "name": "isObject",
                                        "arguments": [
                                            {
                                                "nodeType": "Variable",
                                                "name": "scan_0"
                                            }
                                        ]
                                    },
                                    "then": {
                                        "nodeType": "FunctionCall",
                                        "name": "keepFields",
                                        "arguments": [
                                            {
                                                "nodeType": "Variable",
                                                "name": "scan_0"
                                            },
                                            {
                                                "nodeType": "Const",
                                                "value": "_id"
                                            },
                                            {
                                                "nodeType": "Const",
                                                "value": "in3"
                                            }
                                        ]
                                    },
                                    "else": {
                                        "nodeType": "Variable",
                                        "name": "scan_0"
                                    }
                                },
                                "expression": {
                                    "nodeType": "Let",
                                    "variable": "valField_0",
                                    "bind": {
                                        "nodeType": "BinaryOp",
                                        "op": "Add",
                                        "left": {
                                            "nodeType": "FunctionCall",
                                            "name": "getField",
                                            "arguments": [
                                                {
                                                    "nodeType": "Variable",
                                                    "name": "scan_0"
                                                },
                                                {
                                                    "nodeType": "Const",
                                                    "value": "in1"
                                                }
                                            ]
                                        },
                                        "right": {
                                            "nodeType": "FunctionCall",
                                            "name": "getField",
                                            "arguments": [
                                                {
                                                    "nodeType": "Variable",
                                                    "name": "scan_0"
                                                },
                                                {
                                                    "nodeType": "Const",
                                                    "value": "in2"
                                                }
                                            ]
                                        }
                                    },
                                    "expression": {
                                        "nodeType": "If",
                                        "condition": {
                                            "nodeType": "BinaryOp",
                                            "op": "Or",
                                            "left": {
                                                "nodeType": "FunctionCall",
                                                "name": "exists",
                                                "arguments": [
                                                    {
                                                        "nodeType": "Variable",
                                                        "name": "valField_0"
                                                    }
                                                ]
                                            },
                                            "right": {
                                                "nodeType": "FunctionCall",
                                                "name": "isObject",
                                                "arguments": [
                                                    {
                                                        "nodeType": "Variable",
                                                        "name": "inputField_0"
                                                    }
                                                ]
                                            }
                                        },
                                        "then": {
                                            "nodeType": "FunctionCall",
                                            "name": "setField",
                                            "arguments": [
                                                {
                                                    "nodeType": "Variable",
                                                    "name": "inputField_0"
                                                },
                                                {
                                                    "nodeType": "Const",
                                                    "value": "in3"
                                                },
                                                {
                                                    "nodeType": "Variable",
                                                    "name": "valField_0"
                                                }
                                            ]
                                        },
                                        "else": {
                                            "nodeType": "Variable",
                                            "name": "inputField_0"
                                        }
                                    }
                                }
                            },
                            "expression": {
                                "nodeType": "If",
                                "condition": {
                                    "nodeType": "FunctionCall",
                                    "name": "exists",
                                    "arguments": [
                                        {
                                            "nodeType": "Variable",
                                            "name": "valDefault_0"
                                        }
                                    ]
                                },
                                "then": {
                                    "nodeType": "Variable",
                                    "name": "valDefault_0"
                                },
                                "else": {
                                    "nodeType": "Const",
                                    "value": {}
                                }
                            }
                        }
                    },
                    "child": {
                        "nodeType": "Filter",
                        "properties": {
                            "cost": 800.000002,
                            "localCost": 800.000002,
                            "adjustedCE": 1000000.0,
                            "planNodeID": 1,
                            "logicalProperties": {
                                "cardinalityEstimate": [
                                    {
                                        "ce": 481000.0
                                    },
                                    {
                                        "requirementCEs": [
                                            {
                                                "refProjection": "scan_0",
                                                "path": {
                                                    "nodeType": "PathGet",
                                                    "path": "in1",
                                                    "input": {
                                                        "nodeType": "PathIdentity"
                                                    }
                                                },
                                                "ce": 481000.0
                                            }
                                        ]
                                    }
                                ],
                                "projections": [
                                    "scan_0"
                                ],
                                "indexingAvailability": {
                                    "groupId": 0,
                                    "scanProjection": "scan_0",
                                    "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192",
                                    "eqPredsOnly": false
                                },
                                "collectionAvailability": [
                                    "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192"
                                ]
                            },
                            "physicalProperties": {
                                "projections": [
                                    "scan_0"
                                ],
                                "indexingRequirement": {
                                    "target": "Complete",
                                    "dedupRID": true
                                }
                            }
                        },
                        "filter": {
                            "nodeType": "If",
                            "condition": {
                                "nodeType": "FunctionCall",
                                "name": "fillEmpty",
                                "arguments": [
                                    {
                                        "nodeType": "BinaryOp",
                                        "op": "Gt",
                                        "left": {
                                            "nodeType": "BinaryOp",
                                            "op": "Cmp3w",
                                            "left": {
                                                "nodeType": "Variable",
                                                "name": "evalTemp_0"
                                            },
                                            "right": {
                                                "nodeType": "Const",
                                                "value": 500
                                            }
                                        },
                                        "right": {
                                            "nodeType": "Const",
                                            "value": 0
                                        }
                                    },
                                    {
                                        "nodeType": "Const",
                                        "value": false
                                    }
                                ]
                            },
                            "then": {
                                "nodeType": "BinaryOp",
                                "op": "Lt",
                                "left": {
                                    "nodeType": "BinaryOp",
                                    "op": "Cmp3w",
                                    "left": {
                                        "nodeType": "Variable",
                                        "name": "evalTemp_0"
                                    },
                                    "right": {
                                        "nodeType": "Const",
                                        "value": ""
                                    }
                                },
                                "right": {
                                    "nodeType": "Const",
                                    "value": 0
                                }
                            },
                            "else": {
                                "nodeType": "Const",
                                "value": false
                            }
                        },
                        "child": {
                            "nodeType": "PhysicalScan",
                            "properties": {
                                "cost": 800.000002,
                                "localCost": 800.000002,
                                "adjustedCE": 1000000.0,
                                "planNodeID": 0,
                                "logicalProperties": {
                                    "cardinalityEstimate": [
                                        {
                                            "ce": 481000.0
                                        },
                                        {
                                            "requirementCEs": [
                                                {
                                                    "refProjection": "scan_0",
                                                    "path": {
                                                        "nodeType": "PathGet",
                                                        "path": "in1",
                                                        "input": {
                                                            "nodeType": "PathIdentity"
                                                        }
                                                    },
                                                    "ce": 481000.0
                                                }
                                            ]
                                        }
                                    ],
                                    "projections": [
                                        "scan_0"
                                    ],
                                    "indexingAvailability": {
                                        "groupId": 0,
                                        "scanProjection": "scan_0",
                                        "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192",
                                        "eqPredsOnly": false
                                    },
                                    "collectionAvailability": [
                                        "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192"
                                    ]
                                },
                                "physicalProperties": {
                                    "projections": [
                                        "scan_0"
                                    ],
                                    "indexingRequirement": {
                                        "target": "Complete",
                                        "dedupRID": true
                                    }
                                }
                            },
                            "fieldProjectionMap": {
                                "<root>": "scan_0",
                                "in1": "evalTemp_0"
                            },
                            "scanDefName": "c_int_05_1000000_50cdb415-9a4f-45c8-a6bf-4cf991418192",
                            "parallel": false,
                            "bindings": {
                                "evalTemp_0": {
                                    "nodeType": "Source"
                                },
                                "scan_0": {
                                    "nodeType": "Source"
                                }
                            }
                        }
                    }
                }
            },
            "slotBasedPlan": {
                "slots": "$$RESULT=s3 env: {  }",
                "stages": "[2] project [s3 = \n    let [\n        l101.0 = \n            let [\n                l102.0 = \n                    if isObject(s1) \n                    then keepFields(s1, \"_id\", \"in3\") \n                    else s1 \n            ] \n            in \n                let [\n                    l103.0 = (getField(s1, \"in1\") + getField(s1, \"in2\")) \n                ] \n                in \n                    if (exists(l103.0) || isObject(l102.0)) \n                    then setField(l102.0, \"in3\", l103.0) \n                    else l102.0 \n    ] \n    in \n        if exists(l101.0) \n        then l101.0 \n        else {} \n] \n[1] filter {\n    if fillEmpty(((s2 <=> 500) > 0), false) \n    then ((s2 <=> \"\") < 0) \n    else false \n} \n[0] scan s1 none none none none none [s2 = in1] @\"50cdb415-9a4f-45c8-a6bf-4cf991418192\" true false "
            }
        },
        "rejectedPlans": []
    },
    "executionStats": {
        "executionSuccess": true,
        "nReturned": 494877,
        "executionTimeMillis": 1054,
        "totalKeysExamined": 0,
        "totalDocsExamined": 1000000,
        "executionStages": {
            "stage": "project",
            "planNodeId": 2,
            "nReturned": 494877,
            "executionTimeMillisEstimate": 681,
            "executionTimeMicros": 681770,
            "opens": 1,
            "closes": 1,
            "saveState": 0,
            "restoreState": 0,
            "isEOF": 1,
            "projections": {
                "3": "\n    let [\n        l101.0 = \n            let [\n                l102.0 = \n                    if isObject(s1) \n                    then keepFields(s1, \"_id\", \"in3\") \n                    else s1 \n            ] \n            in \n                let [\n                    l103.0 = (getField(s1, \"in1\") + getField(s1, \"in2\")) \n                ] \n                in \n                    if (exists(l103.0) || isObject(l102.0)) \n                    then setField(l102.0, \"in3\", l103.0) \n                    else l102.0 \n    ] \n    in \n        if exists(l101.0) \n        then l101.0 \n        else {} \n"
            },
            "inputStage": {
                "stage": "filter",
                "planNodeId": 1,
                "nReturned": 494877,
                "executionTimeMillisEstimate": 333,
                "executionTimeMicros": 333419,
                "opens": 1,
                "closes": 1,
                "saveState": 0,
                "restoreState": 0,
                "isEOF": 1,
                "numTested": 1000000,
                "filter": "\n    if fillEmpty(((s2 <=> 500) > 0), false) \n    then ((s2 <=> \"\") < 0) \n    else false \n",
                "inputStage": {
                    "stage": "scan",
                    "planNodeId": 0,
                    "nReturned": 1000000,
                    "executionTimeMillisEstimate": 2,
                    "executionTimeMicros": 2389,
                    "opens": 1,
                    "closes": 1,
                    "saveState": 0,
                    "restoreState": 0,
                    "isEOF": 1,
                    "numReads": 1000000,
                    "recordSlot": 1,
                    "fields": [
                        "in1"
                    ],
                    "outputSlots": [
                        2
                    ]
                }
            }
        }
    },
    "command": {
        "aggregate": "c_int_05_1000000",
        "pipeline": [
            {
                "$match": {
                    "in1": {
                        "$gt": 500
                    }
                }
            },
            {
                "$project": {
                    "in3": {
                        "$add": [
                            "$in1",
                            "$in2"
                        ]
                    }
                }
            }
        ],
        "cursor": {},
        "$db": "abt_calibration_big"
    },
    "serverInfo": {
        "host": "ip-10-122-6-29",
        "port": 27017,
        "version": "6.2.0-alpha",
        "gitVersion": "unknown"
    },
    "serverParameters": {
        "internalQueryFacetBufferSizeBytes": 104857600,
        "internalQueryFacetMaxOutputDocSizeBytes": 104857600,
        "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
        "internalDocumentSourceGroupMaxMemoryBytes": 104857600,
        "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
        "internalQueryProhibitBlockingMergeOnMongoS": 0,
        "internalQueryMaxAddToSetBytes": 104857600,
        "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600
    },
    "ok": 1.0
}


# Correlate numeric columns only. pandas >= 2.0 no longer drops string/object
# columns (such as abt_type and run_id) silently and raises instead.
eval_sum_df.select_dtypes(include='number').corr()


# Scatter of execution_time against n_returned for the eval_sum_df rows.
sns.scatterplot(data=eval_sum_df, x='n_returned', y='execution_time', color='blue')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# Run the project's calibration helper on the eval_sum_df rows; the recorded
# output of this cell is an R2 score and a coefficient vector.
exp.calibrate(eval_sum_df)

R2: 0.9912578829453941
Coefficients: [2.75194138e+04 7.07673319e-01]


# Keep only the rows whose ABT node type is 'Unwind'.
unwind_df = abt_df[abt_df.abt_type == 'Unwind']
unwind_df.head()


unwind_df.describe()


# Correlate numeric columns only. pandas >= 2.0 no longer drops string/object
# columns (such as abt_type and run_id) silently and raises instead.
unwind_df.select_dtypes(include='number').corr()


# Scatter of execution_time against n_returned for Unwind nodes.
sns.scatterplot(data=unwind_df, x='n_returned', y='execution_time', color='blue')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# Scatter of execution_time against n_processed for Unwind nodes.
sns.scatterplot(data=unwind_df, x='n_processed', y='execution_time', color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Ordinary least squares: execution_time ~ const + n_returned.
y = unwind_df['execution_time']
X = sm.add_constant(unwind_df[['n_returned']])
unwind_lm = sm.OLS(y, X).fit()
unwind_lm.summary()


# Draw the fitted values against n_returned, the regressor the model was fit
# on. Plotting them against n_processed (a variable that is not in the model)
# does not show the regression line.
y_pred = unwind_lm.predict(X)
sns.scatterplot(x=unwind_df['n_returned'], y=unwind_df['execution_time'])
sns.lineplot(x=unwind_df['n_returned'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the project's calibration helper on the Unwind rows; the recorded
# output of this cell is an R2 score and a coefficient vector.
exp.calibrate(unwind_df)

R2: 0.7361659234382227
Coefficients: [4.13941176e+01 2.06492702e-03]


# Keep only the rows whose ABT node type is 'PhysicalScan'.
scan_df = abt_df[abt_df.abt_type == 'PhysicalScan']
scan_df.head()


scan_df.describe()


# Correlate numeric columns only. pandas >= 2.0 no longer drops string/object
# columns (such as abt_type and run_id) silently and raises instead.
scan_df.select_dtypes(include='number').corr()


# Scatter of execution_time against n_processed for PhysicalScan nodes.
sns.scatterplot(data=scan_df, x='n_processed', y='execution_time', color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Scatter of execution_time against average document size for PhysicalScan nodes.
sns.scatterplot(
    data=scan_df,
    x='average_document_size_in_bytes',
    y='execution_time',
    color='blue',
)

<AxesSubplot:xlabel='average_document_size_in_bytes', ylabel='execution_time'>


# Ordinary least squares:
# execution_time ~ const + n_returned + average_document_size_in_bytes.
y = scan_df['execution_time']
X = sm.add_constant(scan_df[['n_returned', 'average_document_size_in_bytes']])
scan_lm = sm.OLS(y, X).fit()
scan_lm.summary()


# Draw the fitted values against n_returned, a regressor of this model,
# rather than n_processed, which this particular fit does not use.
y_pred = scan_lm.predict(X)
sns.scatterplot(x=scan_df['n_returned'], y=scan_df['execution_time'])
sns.lineplot(x=scan_df['n_returned'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Ordinary least squares:
# execution_time ~ const + n_processed + average_document_size_in_bytes.
X = sm.add_constant(scan_df[['n_processed', 'average_document_size_in_bytes']])
y = scan_df['execution_time']
scan_lm = sm.OLS(endog=y, exog=X).fit()
scan_lm.summary()


# Observed points with the fitted values overlaid in red.
y_pred = scan_lm.predict(exog=X)
sns.scatterplot(data=scan_df, x='n_processed', y='execution_time')
sns.lineplot(x=scan_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# GLM on n_processed and average document size with a linear constraint
# R·params = q, where R = [1, 0, 0] and q = 0: the first parameter (the
# column prepended by add_constant) is held at zero.
X = sm.add_constant(scan_df[['n_processed', 'average_document_size_in_bytes']])
y = scan_df['execution_time']
scan_glm = sm.GLM(endog=y, exog=X).fit_constrained(([1, 0, 0], 0))
scan_glm.summary()


# Observed points with the constrained fit overlaid in red.
y_pred = scan_glm.predict(exog=X)
sns.scatterplot(data=scan_df, x='n_processed', y='execution_time')
sns.lineplot(x=scan_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# GLM on n_processed alone with the constraint [1, 0]·params = 0: the first
# parameter (the column prepended by add_constant) is held at zero.
X = sm.add_constant(scan_df[['n_processed']])
y = scan_df['execution_time']
scan_glm = sm.GLM(endog=y, exog=X).fit_constrained(([1, 0], 0))
scan_glm.summary()


# Observed points with the constrained fit overlaid in red.
y_pred = scan_glm.predict(exog=X)
sns.scatterplot(data=scan_df, x='n_processed', y='execution_time')
sns.lineplot(x=scan_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the project's calibration helper on the PhysicalScan rows; the recorded
# output of this cell is an R2 score and a coefficient vector.
exp.calibrate(scan_df)

R2: 0.6334014132570354
Coefficients: [0.         0.00226165]


# Keep only the rows whose ABT node type is 'IndexScan'.
ixscan_df = abt_df[abt_df.abt_type == 'IndexScan']
ixscan_df.head()


ixscan_df.describe()


# Correlate numeric columns only. pandas >= 2.0 no longer drops string/object
# columns (such as abt_type and run_id) silently and raises instead.
ixscan_df.select_dtypes(include='number').corr()


# Scatter of execution_time against n_processed for IndexScan nodes.
sns.scatterplot(data=ixscan_df, x='n_processed', y='execution_time', color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Scatter of execution_time against n_returned for IndexScan nodes.
sns.scatterplot(data=ixscan_df, x='n_returned', y='execution_time', color='blue')

<AxesSubplot:xlabel='n_returned', ylabel='execution_time'>


# Ordinary least squares: execution_time ~ const + n_processed.
X = sm.add_constant(ixscan_df[['n_processed']])
y = ixscan_df['execution_time']
ixscan_lm = sm.OLS(endog=y, exog=X).fit()
ixscan_lm.summary()


# Observed points with the fitted values overlaid in red.
y_pred = ixscan_lm.predict(exog=X)
sns.scatterplot(data=ixscan_df, x='n_processed', y='execution_time')
sns.lineplot(x=ixscan_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the project's calibration helper on the IndexScan rows; the recorded
# output of this cell is an R2 score and a coefficient vector.
exp.calibrate(ixscan_df)

R2: 0.897909337334287
Coefficients: [0.         0.00334613]


# Keep only the rows whose ABT node type is 'Seek'.
seek_df = abt_df[abt_df.abt_type == 'Seek']
seek_df.head()


seek_df.describe()


# Correlate numeric columns only. pandas >= 2.0 no longer drops string/object
# columns (such as abt_type and run_id) silently and raises instead.
seek_df.select_dtypes(include='number').corr()


# Scatter of execution_time against n_processed for Seek nodes.
sns.scatterplot(data=seek_df, x='n_processed', y='execution_time', color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Ordinary least squares: execution_time ~ const + n_processed.
X = sm.add_constant(seek_df[['n_processed']])
y = seek_df['execution_time']
seek_lm = sm.OLS(endog=y, exog=X).fit()
seek_lm.summary()


# Observed points with the fitted values overlaid in red.
y_pred = seek_lm.predict(exog=X)
sns.scatterplot(data=seek_df, x='n_processed', y='execution_time')
sns.lineplot(x=seek_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# GLM on n_processed with the constraint [1, 0]·params = 0: the first
# parameter (the column prepended by add_constant) is held at zero.
X = sm.add_constant(seek_df[['n_processed']])
y = seek_df['execution_time']
seek_glm = sm.GLM(endog=y, exog=X).fit_constrained(([1, 0], 0))
seek_glm.summary()


# Observed points with the constrained fit overlaid in red.
y_pred = seek_glm.predict(exog=X)
sns.scatterplot(data=seek_df, x='n_processed', y='execution_time')
sns.lineplot(x=seek_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the project's calibration helper on the Seek rows; the recorded
# output of this cell is an R2 score and a coefficient vector.
exp.calibrate(seek_df)

R2: 0.6003597139964046
Coefficients: [0.         0.50840207]


# Keep only the rows whose ABT node type is 'Filter'.
filter_df = abt_df[abt_df.abt_type == 'Filter']
filter_df.head()


filter_df.describe()


# Correlate numeric columns only. pandas >= 2.0 no longer drops string/object
# columns (such as abt_type and run_id) silently and raises instead.
filter_df.select_dtypes(include='number').corr()


# Scatter of execution_time against n_processed for Filter nodes.
sns.scatterplot(data=filter_df, x='n_processed', y='execution_time', color='blue')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Scatter of execution_time against keys_length_in_bytes for Filter nodes.
sns.scatterplot(data=filter_df, x='keys_length_in_bytes', y='execution_time', color='blue')

<AxesSubplot:xlabel='keys_length_in_bytes', ylabel='execution_time'>


# First OLS fit: execution_time ~ const + n_processed + keys_length_in_bytes.
X = sm.add_constant(filter_df[['n_processed', 'keys_length_in_bytes']])
y = filter_df['execution_time']
filter_lm = sm.OLS(endog=y, exog=X).fit()
filter_lm.summary()


# Second OLS fit on n_processed alone; this rebinds filter_lm.
X = sm.add_constant(filter_df[['n_processed']])
y = filter_df['execution_time']
filter_lm = sm.OLS(endog=y, exog=X).fit()
filter_lm.summary()


# GLM on n_processed with the constraint [1, 0]·params = 0: the first
# parameter (the column prepended by add_constant) is held at zero.
X = sm.add_constant(filter_df[['n_processed']])
y = filter_df['execution_time']
filter_glm = sm.GLM(endog=y, exog=X).fit_constrained(([1, 0], 0))
filter_glm.summary()


# Observed points with the constrained fit overlaid in red.
y_pred = filter_glm.predict(exog=X)
sns.scatterplot(data=filter_df, x='n_processed', y='execution_time')
sns.lineplot(x=filter_df['n_processed'], y=y_pred, color='red')

<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>


# Run the project's calibration helper on the Filter rows; the recorded
# output of this cell is an R2 score and a coefficient vector.
exp.calibrate(filter_df)

R2: 0.9830974658266592
Coefficients: [0.        0.4724306]

	total_execution_time
count	1.564000e+04
mean	6.617052e+05
std	1.058255e+06
min	1.000000e+01
25%	7.275500e+03
50%	6.583500e+04
75%	1.059969e+06
max	7.024473e+06

	total_execution_time
count	1.411400e+04
mean	6.467882e+05
std	1.036968e+06
min	1.000000e+01
25%	6.824250e+03
50%	6.364950e+04
75%	1.021397e+06
max	7.015423e+06

	abt_type	execution_time	n_processed	keys_length_in_bytes	average_document_size_in_bytes	run_id	source
18365	MergeJoin	633	1252	1	110	6336451e7919d20f9deafd8e	5440
18390	MergeJoin	617	1252	1	110	6336451e7919d20f9deafd8e	5444
18397	MergeJoin	616	1252	1	110	6336451e7919d20f9deafd8e	5445
18410	MergeJoin	629	1252	1	110	6336451e7919d20f9deafd8e	5448
18417	MergeJoin	623	1252	1	110	6336451e7919d20f9deafd8e	5449

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
count	1726.000000	1726.000000	1726.000000	1726.0	1726.0	1726.0	1726.000000
mean	9322.962920	37.786211	19308.841251	1.0	110.0	0.0	8868.545771
std	6058.184291	34.792521	12577.694721	0.0	0.0	0.0	1951.001476
min	176.000000	0.000000	352.000000	1.0	110.0	0.0	5440.000000
25%	4273.250000	10.000000	8853.000000	1.0	110.0	0.0	7217.500000
50%	8177.000000	24.000000	16801.000000	1.0	110.0	0.0	8880.500000
75%	13567.250000	62.750000	28204.250000	1.0	110.0	0.0	10497.500000
max	24875.000000	147.000000	52044.000000	1.0	110.0	0.0	12178.000000

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
execution_time	1.000000	0.951772	0.999881	NaN	NaN	NaN	0.618293
n_returned	0.951772	1.000000	0.951941	NaN	NaN	NaN	0.445914
n_processed	0.999881	0.951941	1.000000	NaN	NaN	NaN	0.617830
keys_length_in_bytes	NaN	NaN	NaN	NaN	NaN	NaN	NaN
average_document_size_in_bytes	NaN	NaN	NaN	NaN	NaN	NaN	NaN
number_of_fields	NaN	NaN	NaN	NaN	NaN	NaN	NaN
source	0.618293	0.445914	0.617830	NaN	NaN	NaN	1.000000

MergeJoin¶

BinaryJoin¶

HashJoin¶

Union¶

LimitSkip¶

GroupBy¶

Evaluation¶

Evaluation Unwind¶

Evaluation Sum¶

Unwind¶

PhysicalScan¶

IndexScan¶

Seek¶

Filter¶

	abt_type	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	run_id
0	LimitSkip	2209	59685	59685	3	42	633624617919d20f9deae62e
1	BinaryJoin	58384	59685	59685	3	42	633624617919d20f9deae62e
2	IndexScan	157	59685	59686	3	42	633624617919d20f9deae62e
3	Seek	2906	59685	59685	3	42	633624617919d20f9deae62e
4	Root	0	59685	0	3	42	633624617919d20f9deae62e

Dep. Variable:	execution_time	R-squared:	1.000
Model:	OLS	Adj. R-squared:	1.000
Method:	Least Squares	F-statistic:	7.237e+06
Date:	Fri, 30 Sep 2022	Prob (F-statistic):	0.00
Time:	10:57:14	Log-Likelihood:	-10281.
No. Observations:	1726	AIC:	2.057e+04
Df Residuals:	1724	BIC:	2.058e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	23.7559	4.125	5.759	0.000	15.665	31.847
n_processed	0.4816	0.000	2690.101	0.000	0.481	0.482

Omnibus:	106.850	Durbin-Watson:	0.356
Prob(Omnibus):	0.000	Jarque-Bera (JB):	265.553
Skew:	0.349	Prob(JB):	2.17e-58
Kurtosis:	4.790	Cond. No.	4.22e+04

Omnibus:	107.983	Durbin-Watson:	0.356
Prob(Omnibus):	0.000	Jarque-Bera (JB):	265.769
Skew:	0.357	Prob(JB):	1.95e-58
Kurtosis:	4.785	Cond. No.	5.08e+04

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
count	1.012700e+04	1.012700e+04	1.012700e+04	10127.000000	10127.000000	10127.000000	10127.000000
mean	4.756040e+04	4.818516e+04	4.859375e+04	2.178434	108.670485	0.119285	8783.434976
std	1.276394e+05	1.299854e+05	1.298404e+05	2.882973	18.483604	0.473670	3632.120204
min	0.000000e+00	0.000000e+00	0.000000e+00	1.000000	42.000000	0.000000	0.000000
25%	6.800000e+01	2.000000e+00	6.400000e+01	1.000000	110.000000	0.000000	6209.500000
50%	1.222000e+03	7.500000e+01	9.580000e+02	1.000000	110.000000	0.000000	9009.000000
75%	1.060500e+04	1.139900e+04	1.171700e+04	1.000000	110.000000	0.000000	11801.500000
max	1.264509e+06	1.268990e+06	1.268990e+06	12.000000	133.000000	2.000000	14788.000000

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
execution_time	1.000000	0.999830	0.999918	0.272291	-0.068941	0.446514	-0.139822
n_returned	0.999830	1.000000	0.999955	0.278109	-0.067185	0.443475	-0.145968
n_processed	0.999918	0.999955	1.000000	0.277134	-0.067034	0.443178	-0.144273
keys_length_in_bytes	0.272291	0.278109	0.277134	1.000000	-0.132785	-0.102948	-0.630184
average_document_size_in_bytes	-0.068941	-0.067185	-0.067034	-0.132785	1.000000	0.018116	0.260180
number_of_fields	0.446514	0.443475	0.443178	-0.102948	0.018116	1.000000	0.387364
source	-0.139822	-0.145968	-0.144273	-0.630184	0.260180	0.387364	1.000000

Omnibus:	3542.682	Durbin-Watson:	0.428
Prob(Omnibus):	0.000	Jarque-Bera (JB):	844656.937
Skew:	-0.421	Prob(JB):	0.00
Kurtosis:	47.733	Cond. No.	2.22e+05

Omnibus:	6057.968	Durbin-Watson:	0.979
Prob(Omnibus):	0.000	Jarque-Bera (JB):	202589.477
Skew:	2.315	Prob(JB):	0.00
Kurtosis:	24.417	Cond. No.	1.48e+05

Omnibus:	3037.117	Durbin-Watson:	0.497
Prob(Omnibus):	0.000	Jarque-Bera (JB):	504315.454
Skew:	-0.099	Prob(JB):	0.00
Kurtosis:	37.571	Cond. No.	1.48e+05

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	22.4286	4.964	4.518	0.000	12.692	32.165
n_processed	0.4819	0.001	824.206	0.000	0.481	0.483
n_returned	-0.1016	0.211	-0.481	0.631	-0.516	0.313

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	-421.7009	17.160	-24.575	0.000	-455.337	-388.065
n_processed	1.4524	0.012	118.113	0.000	1.428	1.476
n_returned	-0.4689	0.012	-38.175	0.000	-0.493	-0.445

	abt_type	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	run_id	source
18764	HashJoin	2812	0	4741	1	110	6336451f7919d20f9deafd95	5510
18807	HashJoin	2811	0	4741	1	110	6336451f7919d20f9deafd95	5518
18958	HashJoin	19862	22	27150	1	110	633645207919d20f9deafd98	5545
18965	HashJoin	19837	22	27150	1	110	633645207919d20f9deafd98	5546
18996	HashJoin	17777	20	25612	1	110	633645207919d20f9deafd99	5551

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	-1.088e+05	5921.432	-18.373	0.000	-1.2e+05	-9.72e+04
n_processed	1.3322	0.004	331.593	0.000	1.324	1.340
n_returned	-8.1563	1.246	-6.548	0.000	-10.599	-5.713

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
count	1.834000e+03	1834.000000	1.834000e+03	1834.0	1834.0	1834.0	1834.000000
mean	1.858712e+06	4577.565431	1.504955e+06	1.0	110.0	0.0	8876.460742
std	1.665740e+06	4069.831101	1.261856e+06	0.0	0.0	0.0	1969.201490
min	2.811000e+03	0.000000	4.741000e+03	1.0	110.0	0.0	5510.000000
25%	4.188332e+05	950.000000	4.108400e+05	1.0	110.0	0.0	7198.250000
50%	1.389704e+06	3552.000000	1.149390e+06	1.0	110.0	0.0	8870.500000
75%	2.934652e+06	7177.000000	2.374488e+06	1.0	110.0	0.0	10545.750000
max	6.931686e+06	15732.000000	4.978615e+06	1.0	110.0	0.0	12232.000000

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
execution_time	1.000000	0.675287	0.995430	NaN	NaN	NaN	0.527774
n_returned	0.675287	1.000000	0.688905	NaN	NaN	NaN	0.473347
n_processed	0.995430	0.688905	1.000000	NaN	NaN	NaN	0.519010
keys_length_in_bytes	NaN	NaN	NaN	NaN	NaN	NaN	NaN
average_document_size_in_bytes	NaN	NaN	NaN	NaN	NaN	NaN	NaN
number_of_fields	NaN	NaN	NaN	NaN	NaN	NaN	NaN
source	0.527774	0.473347	0.519010	NaN	NaN	NaN	1.000000

Omnibus:	398.945	Durbin-Watson:	0.148
Prob(Omnibus):	0.000	Jarque-Bera (JB):	1459.629
Skew:	1.033	Prob(JB):	0.00
Kurtosis:	6.851	Cond. No.	3.17e+06

Omnibus:	368.827	Durbin-Watson:	0.143
Prob(Omnibus):	0.000	Jarque-Bera (JB):	1094.659
Skew:	1.022	Prob(JB):	1.98e-238
Kurtosis:	6.185	Cond. No.	3.06e+06

	abt_type	execution_time	n_processed	keys_length_in_bytes	average_document_size_in_bytes	run_id	source
58868	Union	6	1252	1	110	63365d777919d20f9deb1ac6	12242
58911	Union	6	1252	1	110	63365d777919d20f9deb1ac6	12249
58984	Union	6	1577	1	110	63365d777919d20f9deb1ac8	12261
58991	Union	4	1577	1	110	63365d777919d20f9deb1ac8	12262
58998	Union	13	1577	1	110	63365d777919d20f9deb1ac8	12264

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
count	799.000000	799.000000	799.000000	799.0	799.0	799.0	799.000000
mean	83.887359	38.868586	19714.630788	1.0	110.0	0.0	13081.634543
std	71.748585	34.823026	12736.668332	0.0	0.0	0.0	493.081987
min	1.000000	0.000000	813.000000	1.0	110.0	0.0	12242.000000
25%	18.500000	11.000000	8938.000000	1.0	110.0	0.0	12672.500000
50%	71.000000	27.000000	17739.000000	1.0	110.0	0.0	13086.000000
75%	134.500000	61.000000	29181.000000	1.0	110.0	0.0	13503.000000
max	360.000000	147.000000	52044.000000	1.0	110.0	0.0	13925.000000

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
execution_time	1.000000	0.920163	0.963655	NaN	NaN	NaN	0.656127
n_returned	0.920163	1.000000	0.946314	NaN	NaN	NaN	0.483939
n_processed	0.963655	0.946314	1.000000	NaN	NaN	NaN	0.670496
keys_length_in_bytes	NaN	NaN	NaN	NaN	NaN	NaN	NaN
average_document_size_in_bytes	NaN	NaN	NaN	NaN	NaN	NaN	NaN
number_of_fields	NaN	NaN	NaN	NaN	NaN	NaN	NaN
source	0.656127	0.483939	0.670496	NaN	NaN	NaN	1.000000

Omnibus:	172.558	Durbin-Watson:	1.587
Prob(Omnibus):	0.000	Jarque-Bera (JB):	581.131
Skew:	1.014	Prob(JB):	6.44e-127
Kurtosis:	6.653	Cond. No.	4.33e+04

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
execution_time	1.000000	0.892805	0.893080	0.109001	0.032698	0.545629	0.083271
n_returned	0.892805	1.000000	0.999955	0.278109	-0.067185	0.443475	-0.145968
n_processed	0.893080	0.999955	1.000000	0.277134	-0.067034	0.443178	-0.144273
keys_length_in_bytes	0.109001	0.278109	0.277134	1.000000	-0.132785	-0.102948	-0.630184
average_document_size_in_bytes	0.032698	-0.067185	-0.067034	-0.132785	1.000000	0.018116	0.260180
number_of_fields	0.545629	0.443475	0.443178	-0.102948	0.018116	1.000000	0.387364
source	0.083271	-0.145968	-0.144273	-0.630184	0.260180	0.387364	1.000000

Omnibus:	2707.668	Durbin-Watson:	0.132
Prob(Omnibus):	0.000	Jarque-Bera (JB):	184892.821
Skew:	0.342	Prob(JB):	0.00
Kurtosis:	23.922	Cond. No.	1.48e+05

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	-12.0000	0	-inf	0.000	-12.000	-12.000
n_processed	0.0948	0.000	205.562	0.000	0.094	0.096

	abt_type	execution_time	n_processed	keys_length_in_bytes	average_document_size_in_bytes	run_id	source
58871	GroupBy	1269	1252	1	110	63365d777919d20f9deb1ac6	12242
58914	GroupBy	1260	1252	1	110	63365d777919d20f9deb1ac6	12249
58987	GroupBy	1586	1577	1	110	63365d777919d20f9deb1ac8	12261
58994	GroupBy	1607	1577	1	110	63365d777919d20f9deb1ac8	12262
59001	GroupBy	1575	1577	1	110	63365d777919d20f9deb1ac8	12264

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
count	810.000000	810.000000	810.000000	810.0	810.0	810.0	810.000000
mean	27328.232099	38.517284	19519.030864	1.0	110.0	0.0	13076.240741
std	19411.364976	34.729259	12689.273304	0.0	0.0	0.0	495.434536
min	881.000000	0.000000	813.000000	1.0	110.0	0.0	12242.000000
25%	10905.000000	10.000000	8925.000000	1.0	110.0	0.0	12641.750000
50%	23979.000000	25.000000	17295.000000	1.0	110.0	0.0	13082.000000
75%	40594.000000	61.000000	28405.000000	1.0	110.0	0.0	13501.750000
max	82277.000000	147.000000	51897.000000	1.0	110.0	0.0	13925.000000

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
execution_time	1.000000	0.946915	0.999021	NaN	NaN	NaN	0.674617
n_returned	0.946915	1.000000	0.946189	NaN	NaN	NaN	0.482164
n_processed	0.999021	0.946189	1.000000	NaN	NaN	NaN	0.668759
keys_length_in_bytes	NaN	NaN	NaN	NaN	NaN	NaN	NaN
average_document_size_in_bytes	NaN	NaN	NaN	NaN	NaN	NaN	NaN
number_of_fields	NaN	NaN	NaN	NaN	NaN	NaN	NaN
source	0.674617	0.482164	0.668759	NaN	NaN	NaN	1.000000

Omnibus:	311.031	Durbin-Watson:	0.432
Prob(Omnibus):	0.000	Jarque-Bera (JB):	1720.593
Skew:	1.654	Prob(JB):	0.00
Kurtosis:	9.327	Cond. No.	4.27e+04

	abt_type	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	run_id	source
68855	Evaluation	348351	494877	494877	1	110	2	63365dc67919d20f9deb2214	13940
68859	Evaluation	348820	494877	494877	1	110	2	63365dc67919d20f9deb2214	13941
68863	Evaluation	344049	494877	494877	1	110	2	63365dc67919d20f9deb2214	13942
68867	Evaluation	344654	494877	494877	1	110	2	63365dc67919d20f9deb2214	13943
68871	Evaluation	347065	494877	494877	1	110	2	63365dc67919d20f9deb2214	13944

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
count	1.530000e+03	1.530000e+03	1.530000e+03	1530.000000	1530.000000	1530.000000	1530.000000
mean	2.096814e+05	7.762058e+06	7.762058e+06	0.500000	87.705882	6.000000	14789.492810
std	3.683440e+05	8.449659e+06	8.449659e+06	0.500163	22.304124	4.001308	490.947636
min	3.319000e+03	4.466000e+03	4.466000e+03	0.000000	65.000000	2.000000	13940.000000
25%	1.581975e+04	2.597320e+05	2.597320e+05	0.000000	65.000000	2.000000	14365.250000
50%	2.870100e+04	3.737672e+06	3.737672e+06	0.500000	88.000000	6.000000	14789.500000
75%	2.244522e+05	1.500806e+07	1.500806e+07	1.000000	110.000000	10.000000	15213.750000
max	1.729344e+06	2.499542e+07	2.499542e+07	1.000000	110.000000	10.000000	15638.000000

	execution_time	n_returned	n_processed	keys_length_in_bytes	average_document_size_in_bytes	number_of_fields	source
execution_time	1.000000	-0.383822	-0.383822	0.511727	0.511709	-0.511727	-0.340920
n_returned	-0.383822	1.000000	1.000000	-0.856928	-0.854474	0.856928	0.928525
n_processed	-0.383822	1.000000	1.000000	-0.856928	-0.854474	0.856928	0.928525
keys_length_in_bytes	0.511727	-0.856928	-0.856928	1.000000	0.999878	-1.000000	-0.866017
average_document_size_in_bytes	0.511709	-0.854474	-0.854474	0.999878	1.000000	-0.999878	-0.864298
number_of_fields	-0.511727	0.856928	0.856928	-1.000000	-0.999878	1.000000	0.866017
source	-0.340920	0.928525	0.928525	-0.866017	-0.864298	0.866017	1.000000

Omnibus:	540.865	Durbin-Watson:	0.182
Prob(Omnibus):	0.000	Jarque-Bera (JB):	1874.829
Skew:	1.743	Prob(JB):	0.00
Kurtosis:	7.154	Cond. No.	6.06e+07

	abt_type	execution_time	n_returned	n_processed	average_document_size_in_bytes	number_of_fields	run_id	source
73123	Evaluation	9996	4999248	4999248	65	10	633664ec7919d20f9deb25bb	14791
73127	Evaluation	6521	4999248	4999248	65	10	633664ec7919d20f9deb25bb	14792
73131	Evaluation	6387	4999248	4999248	65	10	633664ec7919d20f9deb25bb	14793
73135	Evaluation	6455	4999248	4999248	65	10	633664ec7919d20f9deb25bb	14794
73139	Evaluation	6498	4999248	4999248	65	10	633664ec7919d20f9deb25bb	14795