#Imports
import math
import seaborn as sns
import statsmodels.api as sm
import sys
sys.path.append('/home/ubuntu/workplace/python_notebook/cost_model')
import experiment as exp
from config import DatabaseConfig
from database_instance import DatabaseInstance
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
# Connection settings for the local calibration-results MongoDB instance.
database_config = DatabaseConfig(connection_string='mongodb://localhost',
database_name='abt_calibration_equal_interval', dump_path='',
restore_from_dump=False, dump_on_exit=False)
database = DatabaseInstance(database_config)
# Top-level await: this is notebook-exported code, not an importable module.
df = await exp.load_calibration_data(database, 'calibrationDataEqualInterval')
df.describe()
total_execution_time | |
---|---|
count | 4.500000e+03 |
mean | 4.504779e+05 |
std | 5.744749e+05 |
min | 4.152000e+03 |
25% | 1.559245e+05 |
50% | 2.913080e+05 |
75% | 4.054750e+05 |
max | 2.389553e+06 |
# Keep rows within the [0.0, 0.90] quantile range of total_execution_time —
# presumably trimming the slowest 10% as outliers; TODO confirm
# remove_outliers' exact semantics in experiment.py.
noout_df = exp.remove_outliers(df, 0.0, 0.90)
noout_df.describe()
total_execution_time | |
---|---|
count | 4.050000e+03 |
mean | 4.489712e+05 |
std | 5.742093e+05 |
min | 4.152000e+03 |
25% | 1.557142e+05 |
50% | 2.899445e+05 |
75% | 4.019805e+05 |
max | 2.388043e+06 |
# Explode the per-query rows into one row per ABT plan node.
abt_df = exp.extract_abt_nodes(noout_df)
abt_df.count()
abt_type 20250 execution_time 20250 n_returned 20250 n_processed 20250 keys_length_in_bytes 20250 average_document_size_in_bytes 20250 run_id 20250 dtype: int64
abt_df.abt_type.unique()
array(['IndexScan', 'Root', 'BinaryJoin', 'LimitSkip', 'Seek'], dtype=object)
# Restrict to IndexScan nodes only — the node type being calibrated below.
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
idx_scan_df.head()
abt_type | execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | run_id | |
---|---|---|---|---|---|---|---|
0 | IndexScan | 141 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
5 | IndexScan | 128 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
10 | IndexScan | 122 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
15 | IndexScan | 117 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
20 | IndexScan | 117 | 1026 | 1027 | 688 | 1065 | 633c14def8390e87e2f862a8 |
idx_scan_df.describe()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | |
---|---|---|---|---|---|
count | 4050.000000 | 4.050000e+03 | 4.050000e+03 | 4050.000000 | 4050.000000 |
mean | 3563.027901 | 2.287778e+05 | 2.287788e+05 | 453.388889 | 959.777778 |
std | 3237.451717 | 3.700726e+05 | 3.700726e+05 | 303.412025 | 297.650137 |
min | 45.000000 | 9.500000e+02 | 9.510000e+02 | 3.000000 | 118.000000 |
25% | 702.250000 | 5.662300e+04 | 5.662400e+04 | 158.000000 | 1065.000000 |
50% | 2861.000000 | 1.132395e+05 | 1.132405e+05 | 438.000000 | 1065.000000 |
75% | 5556.500000 | 1.687260e+05 | 1.687270e+05 | 718.000000 | 1065.000000 |
max | 16649.000000 | 1.500543e+06 | 1.500544e+06 | 998.000000 | 1065.000000 |
idx_scan_df.corr()
execution_time | n_returned | n_processed | keys_length_in_bytes | average_document_size_in_bytes | |
---|---|---|---|---|---|
execution_time | 1.000000 | 0.437456 | 0.437456 | 0.418959 | -0.356969 |
n_returned | 0.437456 | 1.000000 | 1.000000 | -0.429853 | -0.980535 |
n_processed | 0.437456 | 1.000000 | 1.000000 | -0.429853 | -0.980535 |
keys_length_in_bytes | 0.418959 | -0.429853 | -0.429853 | 1.000000 | 0.439227 |
average_document_size_in_bytes | -0.356969 | -0.980535 | -0.980535 | 0.439227 | 1.000000 |
sns.scatterplot(x=idx_scan_df['n_processed'], y=idx_scan_df['execution_time'])
<AxesSubplot:xlabel='n_processed', ylabel='execution_time'>
# Ordinary least squares: execution_time ~ const + n_processed.
y = idx_scan_df['execution_time']
X = sm.add_constant(idx_scan_df[['n_processed']])
idx_scan_lm = sm.OLS(y, X).fit()
idx_scan_lm.summary()
Dep. Variable: | execution_time | R-squared: | 0.191 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.191 |
Method: | Least Squares | F-statistic: | 958.0 |
Date: | Wed, 12 Oct 2022 | Prob (F-statistic): | 5.56e-189 |
Time: | 11:54:53 | Log-Likelihood: | -38050. |
No. Observations: | 4050 | AIC: | 7.610e+04 |
Df Residuals: | 4048 | BIC: | 7.612e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 2687.5063 | 53.790 | 49.963 | 0.000 | 2582.049 | 2792.964 |
n_processed | 0.0038 | 0.000 | 30.951 | 0.000 | 0.004 | 0.004 |
Omnibus: | 895.274 | Durbin-Watson: | 0.131 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1661.744 |
Skew: | 1.381 | Prob(JB): | 0.00 |
Kurtosis: | 4.490 | Cond. No. | 5.11e+05 |
y = idx_scan_df['execution_time']
X = idx_scan_df[['n_processed']]
X = sm.add_constant(X)
# Gaussian GLM with a linear constraint (R, q) = ([1, 0], min): the [1, 0]
# row selects the const column, pinning the intercept to the minimum observed
# execution time (45 in the recorded output), so only the n_processed slope
# is estimated.
idx_scan_glm = sm.GLM(y, X).fit_constrained(([1, 0], idx_scan_df.execution_time.min()))
idx_scan_glm.summary()
Dep. Variable: | execution_time | No. Observations: | 4050 |
---|---|---|---|
Model: | GLM | Df Residuals: | 4049 |
Model Family: | Gaussian | Df Model: | 0 |
Link Function: | identity | Scale: | 1.3528e+07 |
Method: | IRLS | Log-Likelihood: | -38997. |
Date: | Wed, 12 Oct 2022 | Deviance: | 5.4776e+10 |
Time: | 11:55:11 | Pearson chi2: | 5.48e+10 |
No. Iterations: | 1 | Pseudo R-squ. (CS): | -0.2526 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 45.0000 | 0 | inf | 0.000 | 45.000 | 45.000 |
n_processed | 0.0070 | 0.000 | 52.850 | 0.000 | 0.007 | 0.007 |
# Re-extract ABT nodes, restricted to queries over the 'str_choice' field.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice')
abt_df.count()
abt_type 20250 execution_time 20250 n_returned 20250 n_processed 20250 keys_length_in_bytes 20250 average_document_size_in_bytes 20250 run_id 20250 dtype: int64
import pandas as pd
def calibrate(abt_node_df: pd.DataFrame, variables: list[str] | None = None):
    """Calibrate the ABT node given in abt_node_df with the given model input variables.

    Fits a non-negative least-squares linear model of ``execution_time`` on
    ``variables``, prints the R^2 score and coefficients, and plots the data
    with the fitted line overlaid.

    Args:
        abt_node_df: Per-node measurements; must contain 'execution_time',
            'n_processed', and every column named in ``variables``.
        variables: Model input columns; defaults to ['n_processed'].

    Returns:
        The fitted sklearn ``LinearRegression`` model (previously None;
        callers that ignore the return value are unaffected).
    """
    if variables is None:
        variables = ['n_processed']
    y = abt_node_df['execution_time']
    # add_constant + fit_intercept=False: the intercept is modeled as the
    # coefficient of the explicit constant column, so positive=True also
    # constrains the intercept to be non-negative.
    X = sm.add_constant(abt_node_df[variables])
    nnls = LinearRegression(positive=True, fit_intercept=False)
    model = nnls.fit(X, y)
    y_pred = model.predict(X)
    print(f'R2: {r2_score(y, y_pred)}')
    print(f'Coefficients: {model.coef_}')
    sns.scatterplot(x=abt_node_df['n_processed'], y=abt_node_df['execution_time'])
    sns.lineplot(x=abt_node_df['n_processed'], y=y_pred, color='red')
    return model
calibrate(idx_scan_df, ['n_processed', 'keys_length_in_bytes'])
R2: 0.5987985329783202 Coefficients: [0.00000000e+00 5.37292461e-03 5.80927078e+00]
# Re-run the 'str_choice' extraction (repeat of the earlier cell).
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice')
abt_df.count()
abt_type 20250 execution_time 20250 n_returned 20250 n_processed 20250 keys_length_in_bytes 20250 average_document_size_in_bytes 20250 run_id 20250 dtype: int64
abt_df.abt_type.unique()
array(['IndexScan', 'Root', 'BinaryJoin', 'LimitSkip', 'Seek'], dtype=object)
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed']) # cost: 0.0026 — NOTE(review): the recorded coefficient below is ~0.0038; confirm which run this figure refers to
R2: 0.19136805765412557 Coefficients: [2.68750630e+03 3.82693539e-03]
# NOTE(review): the recorded count output is an empty Series — 'str_uniform'
# apparently matched no nodes, so the calibration results below may be stale
# output from an earlier run.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_uniform')
abt_df.count()
Series([], dtype: int64)
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.3794698927458856 Coefficients: [5.7848001 0.0433854]
# Calibrate against queries over the 'int_normal1' field.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'int_normal1')
abt_df.count()
abt_type 14846 execution_time 14846 n_returned 14846 n_processed 14846 keys_length_in_bytes 14846 average_document_size_in_bytes 14846 run_id 14846 dtype: int64
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.2893741110299918 Coefficients: [2.28274307e+02 2.91162622e-02]
# Calibrate against queries over the 'str_choice_200_' field.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice_200_')
abt_df.count()
abt_type 18000 execution_time 18000 n_returned 18000 n_processed 18000 keys_length_in_bytes 18000 average_document_size_in_bytes 18000 run_id 18000 dtype: int64
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.2816592524864636 Coefficients: [2.41369410e+02 2.90449573e-02]
calibrate(idx_scan_df, ['n_processed', 'keys_length_in_bytes'])
R2: 0.590026568940176 Coefficients: [0. 0.01354941 4.61593592]
# Calibrate against queries over the 'str_choice_50_' field.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice_50_')
abt_df.count()
abt_type 2250 execution_time 2250 n_returned 2250 n_processed 2250 keys_length_in_bytes 2250 average_document_size_in_bytes 2250 run_id 2250 dtype: int64
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.28494934654418846 Coefficients: [0. 0.00547649]
# Repeat of the 'str_choice_200_' extraction above.
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice_200_')
abt_df.count()
abt_type 18000 execution_time 18000 n_returned 18000 n_processed 18000 keys_length_in_bytes 18000 average_document_size_in_bytes 18000 run_id 18000 dtype: int64
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
# NOTE(review): the recorded output below is identical to the 'int_normal1'
# run — likely stale output from out-of-order cell execution; re-run to confirm.
calibrate(idx_scan_df, ['n_processed'])
R2: 0.2893741110299918 Coefficients: [2.28274307e+02 2.91162622e-02]
# Repeat the workflow on the smaller equal-interval calibration dataset.
df = await exp.load_calibration_data(database, 'calibrationDataEqualIntervalSmall')
noout_df = exp.remove_outliers(df, 0.0, 0.90)
noout_df.describe()
abt_df = exp.extract_abt_nodes_with_field(noout_df, 'str_choice_20_1')
#abt_df.count()
noout_df.describe()
total_execution_time | |
---|---|
count | 84.000000 |
mean | 77.285714 |
std | 29.445711 |
min | 33.000000 |
25% | 51.500000 |
50% | 75.500000 |
75% | 106.000000 |
max | 122.000000 |
idx_scan_df = abt_df[abt_df.abt_type == 'IndexScan']
calibrate(idx_scan_df, ['n_processed'])
R2: 0.14011342736457788 Coefficients: [15.38671068 0.02208039]