
Performance Models

In [1]:
%run prelude.ipy
In [2]:
import statsmodels.formula.api as sm
from statsmodels import graphics
from statsmodels.api import families
from patsy import Treatment
from eyecode import classify
In [5]:
y = np.log(trials.duration_ms)
X = trials[["base_num", "version_num"]]
sm.GLM(y, X).fit().summary()
Out[5]:
Generalized Linear Model Regression Results
Dep. Variable: duration_ms No. Observations: 1602
Model: GLM Df Residuals: 1600
Model Family: Gaussian Df Model: 1
Link Function: identity Scale: 19.9525928208
Method: IRLS Log-Likelihood: -4669.8
Date: Wed, 06 Nov 2013 Deviance: 31924.
Time: 20:36:14 Pearson chi2: 3.19e+04
No. Iterations: 3
coef std err t P>|t| [95.0% Conf. Int.]
base_num 0.9645 0.031 31.293 0.000 0.904 1.025
version_num 0.4365 0.013 34.385 0.000 0.412 0.461
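Without an intercept, the base_num and version_num coefficients above also absorb the overall mean log-duration. A minimal sketch of the same fit with an explicit constant, using statsmodels' add_constant (not part of the original cell, shown only for comparison):

import numpy as np
import statsmodels.api as sma

y = np.log(trials.duration_ms)
X = sma.add_constant(trials[["base_num", "version_num"]])  # prepend an intercept column
sma.GLM(y, X).fit().summary()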
In [15]:
plot.misc.classify_boxplots(trials, ["base_num"], ["duration_ms_log"], regressor=True)
Out[15]:
In [24]:
import scipy.stats
In [55]:
base = "overload"
versions = programs[programs.base == base].version.values
b_trials = util.filter_program(trials, base)
axes = plot.misc.respprop_distributions(b_trials)
axes[0].figure
Out[55]:
In [57]:
samples = []
for v in versions:
    samples.append(util.filter_program(b_trials, base, v).response_proportion.values)
In [58]:
scipy.stats.mstats.kruskalwallis(*samples)
Out[58]:
(1.4703770023136522, 0.47941508015969614)
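The Kruskal-Wallis test above covers only the overload base (p ≈ 0.48, so no evidence that response proportion differs across its versions). A sketch that repeats the test for every base, assuming the same util.filter_program helper used above:

for b in sorted(programs.base.unique()):
    b_trials = util.filter_program(trials, b)
    samples = [util.filter_program(b_trials, b, v).response_proportion.values
               for v in programs[programs.base == b].version.values]
    h, p = scipy.stats.mstats.kruskalwallis(*samples)
    print b, p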
In [31]:
experiments
Out[31]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 0 to 161
Data columns (total 17 columns):
exp_id               162  non-null values
age                  162  non-null values
degree               162  non-null values
gender               162  non-null values
py_years             162  non-null values
prog_years           162  non-null values
cs_major             162  non-null values
difficulty           162  non-null values
guess_correct        162  non-null values
total_grade          162  non-null values
duration_sec         162  non-null values
location             162  non-null values
gender_num           162  non-null values
degree_num           162  non-null values
cs_major_num         162  non-null values
difficulty_num       162  non-null values
guess_correct_num    162  non-null values
dtypes: float64(3), int64(8), object(6)
In [16]:
y = experiments.total_grade
X = experiments[["age", "py_years", "prog_years"]]
sm.GLM(y, X).fit().summary()
Out[16]:
Generalized Linear Model Regression Results
Dep. Variable: total_grade No. Observations: 162
Model: GLM Df Residuals: 159
Model Family: Gaussian Df Model: 2
Link Function: identity Scale: 579.368216602
Method: IRLS Log-Likelihood: -743.67
Date: Thu, 12 Sep 2013 Deviance: 92120.
Time: 19:20:33 Pearson chi2: 9.21e+04
No. Iterations: 3
coef std err t P>|t| [95.0% Conf. Int.]
age 2.5830 0.120 21.566 0.000 2.348 2.818
py_years 4.4059 1.055 4.175 0.000 2.337 6.474
prog_years -1.1169 0.392 -2.847 0.005 -1.886 -0.348
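As with the trial-level model above, this fit has no intercept term. Later cells use the patsy formula interface, which adds an intercept automatically; a sketch of the same demographic model written that way (reusing the families import from the top of the notebook):

m = sm.glm(formula="total_grade ~ age + py_years + prog_years",
           data=experiments, family=families.Gaussian())
m.fit().summary()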
In [17]:
trials
Out[17]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1602 entries, 0 to 1601
Data columns (total 25 columns):
trial_id                 1602  non-null values
exp_id                   1602  non-null values
base                     1602  non-null values
version                  1602  non-null values
grade_value              1602  non-null values
grade_category           1602  non-null values
started_ms               1602  non-null values
ended_ms                 1602  non-null values
duration_ms              1602  non-null values
keystroke_duration_ms    1602  non-null values
keystroke_count          1602  non-null values
keystroke_efficiency     1602  non-null values
response_proportion      1602  non-null values
code_chars               1602  non-null values
code_lines               1602  non-null values
cyclo_comp               1602  non-null values
hal_eff                  1602  non-null values
hal_vol                  1602  non-null values
output_chars             1602  non-null values
output_lines             1602  non-null values
true_output              1602  non-null values
pred_output              1602  non-null values
grade_perfect            1602  non-null values
grade_correct            1602  non-null values
grade_common             1602  non-null values
dtypes: bool(3), float64(4), int64(13), object(5)
In [21]:
y = trials.grade_value
X = trials[["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "response_proportion", "duration_ms", "keystroke_efficiency"]]
sm.GLM(y, X).fit().summary()
Out[21]:
Generalized Linear Model Regression Results
Dep. Variable: grade_value No. Observations: 1602
Model: GLM Df Residuals: 1595
Model Family: Gaussian Df Model: 6
Link Function: identity Scale: 13.6558380381
Method: IRLS Log-Likelihood: -4363.6
Date: Thu, 12 Sep 2013 Deviance: 21781.
Time: 19:23:46 Pearson chi2: 2.18e+04
No. Iterations: 3
coef std err t P>|t| [95.0% Conf. Int.]
code_lines 0.5147 0.035 14.752 0.000 0.446 0.583
cyclo_comp 0.8050 0.097 8.336 0.000 0.616 0.994
hal_eff -0.0002 1.26e-05 -15.836 0.000 -0.000 -0.000
hal_vol 0.0021 0.002 0.855 0.393 -0.003 0.007
response_proportion 2.8366 0.314 9.023 0.000 2.220 3.453
duration_ms 3.741e-06 1.37e-06 2.733 0.006 1.06e-06 6.42e-06
keystroke_efficiency -0.1747 0.056 -3.142 0.002 -0.284 -0.066
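The predictors in this model sit on very different scales (hal_eff is orders of magnitude larger than response_proportion), so the raw coefficient sizes above are not directly comparable. A sketch that z-scores each column first, so every coefficient measures the effect of one standard deviation of its predictor:

cols = ["code_lines", "cyclo_comp", "hal_eff", "hal_vol",
        "response_proportion", "duration_ms", "keystroke_efficiency"]
X = trials[cols]
X_std = (X - X.mean()) / X.std()   # z-score each predictor
sm.GLM(trials.grade_value, X_std).fit().summary()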
In [41]:
y = trials.grade_value
X = trials[["duration_ms", "response_proportion"]]
sm.GLM(y, X).fit().summary()
Out[41]:
Generalized Linear Model Regression Results
Dep. Variable: grade_value No. Observations: 1602
Model: GLM Df Residuals: 1600
Model Family: Gaussian Df Model: 1
Link Function: identity Scale: 28.9801792829
Method: IRLS Log-Likelihood: -4968.8
Date: Thu, 12 Sep 2013 Deviance: 46368.
Time: 20:18:04 Pearson chi2: 4.64e+04
No. Iterations: 3
coef std err t P>|t| [95.0% Conf. Int.]
duration_ms 2.558e-05 1.52e-06 16.834 0.000 2.26e-05 2.86e-05
response_proportion 9.9774 0.359 27.796 0.000 9.274 10.681
In [23]:
y = trials.grade_correct
X = trials[["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "response_proportion", "duration_ms", "keystroke_efficiency"]]
sm.Logit(y, X).fit().summary()
Optimization terminated successfully.
         Current function value: 848.329276
         Iterations 6

Out[23]:
Logit Regression Results
Dep. Variable: grade_correct No. Observations: 1602
Model: Logit Df Residuals: 1595
Method: MLE Df Model: 6
Date: Thu, 12 Sep 2013 Pseudo R-squ.: 0.07538
Time: 19:28:13 Log-Likelihood: -848.33
converged: True LL-Null: -917.49
LLR p-value: 2.259e-27
coef std err z P>|z| [95.0% Conf. Int.]
code_lines 0.0350 0.023 1.511 0.131 -0.010 0.080
cyclo_comp -0.4135 0.074 -5.552 0.000 -0.560 -0.268
hal_eff -9.632e-05 9.56e-06 -10.072 0.000 -0.000 -7.76e-05
hal_vol 0.0112 0.002 5.941 0.000 0.007 0.015
response_proportion 0.5612 0.201 2.797 0.005 0.168 0.954
duration_ms 1.905e-06 1.02e-06 1.867 0.062 -9.52e-08 3.91e-06
keystroke_efficiency -0.1560 0.034 -4.617 0.000 -0.222 -0.090
In [46]:
cols = ["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "duration_ms", "response_proportion", "keystroke_efficiency"]
df = classify.feature_importances(trials, cols, "grade_correct")
fig = plot.misc.feature_importances(df).figure
fig.tight_layout()
fig
Out[46]:
In [52]:
trials[["response_proportion", "grade_correct"]].corr()
Out[52]:
response_proportion grade_correct
response_proportion 1.000000 -0.044063
grade_correct -0.044063 1.000000
In [28]:
cols = ["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "duration_ms"]
classify.feature_importances(trials, cols, "grade_value", regressor=True)
Out[28]:
column importance importance_std
0 cyclo_comp 0.359428 0.273018
1 hal_vol 0.245420 0.235357
2 hal_eff 0.209919 0.219426
3 code_lines 0.124042 0.148497
4 duration_ms 0.061191 0.013429
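classify.feature_importances is part of the eyecode package; a rough, hypothetical stand-in, assuming it wraps a scikit-learn random forest and summarizes the per-tree importances (which would produce a table shaped like the one above):

from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas

cols = ["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "duration_ms"]
forest = RandomForestRegressor(n_estimators=100)
forest.fit(trials[cols].values, trials.grade_value.values)
per_tree = np.array([t.feature_importances_ for t in forest.estimators_])  # importance per tree
pandas.DataFrame({"column": cols,
                  "importance": forest.feature_importances_,
                  "importance_std": per_tree.std(axis=0)})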
In [36]:
cols = ["age", "py_years", "prog_years", "degree_num", "gender_num", "cs_major_num"]
df = classify.feature_importances(experiments, cols, "total_grade", regressor=True, num_estimators=1000)
plot.misc.feature_importances(df).figure
Out[36]:
In [78]:
cols = ["cyclo_comp", "hal_eff", "hal_vol", "duration_ms", "response_proportion", "keystroke_efficiency"]
df = classify.area_under_curve(trials, cols, "grade_correct")
plot.misc.area_under_curve(df).figure
Out[78]:
In [94]:
cols = ["cyclo_comp", "hal_eff", "hal_vol", "duration_ms", "response_proportion", "keystroke_efficiency"]
df = classify.area_under_curve(trials, cols, "grade_value", regressor=True)
plot.misc.area_under_curve(df).figure
Out[94]:
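Like feature_importances, area_under_curve lives in the eyecode classify module. A minimal per-feature sketch for the binary grade_correct case, assuming scikit-learn's roc_auc_score (the regressor=True call above presumably substitutes a regression-appropriate score):

from sklearn.metrics import roc_auc_score
import pandas

cols = ["cyclo_comp", "hal_eff", "hal_vol", "duration_ms",
        "response_proportion", "keystroke_efficiency"]
rows = [(c, roc_auc_score(trials.grade_correct.values, trials[c].values))
        for c in cols]
pandas.DataFrame(rows, columns=["column", "auc"])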

Grade from Program Base/Version

In []:
def to_int(x):
    return x.astype(int)
In [126]:
rows = []
g = True
for _, df in trials.groupby("exp_id"):
    rows.append({ "base": "_chance", "grade_correct": g})
    g = not g
fake_trials = pandas.concat([trials, pandas.DataFrame(rows)], ignore_index=True)
In [134]:
for b, df in trials.groupby("base"):
    print b, df.grade_correct.sum() / float(len(df))
between 0.40251572327
counting 0.590062111801
funcall 0.888888888889
initvar 0.7375
order 0.925465838509
overload 0.832298136646
partition 0.69375
rectangle 0.949685534591
scope 0.51572327044
whitespace 0.8625

In [192]:
m = sm.glm(formula="to_int(grade_correct) ~ base - 1", data=trials, family=families.Binomial())
fit = m.fit()
fit.summary()
Out[192]:
Generalized Linear Model Regression Results
Dep. Variable: to_int(grade_correct) No. Observations: 1602
Model: GLM Df Residuals: 1592
Model Family: Binomial Df Model: 9
Link Function: logit Scale: 1.0
Method: IRLS Log-Likelihood: -784.74
Date: Thu, 07 Nov 2013 Deviance: 1569.5
Time: 15:32:19 Pearson chi2: 1.60e+03
No. Iterations: 7
coef std err t P>|t| [95.0% Conf. Int.]
base[between] -0.3950 0.162 -2.443 0.015 -0.712 -0.078
base[counting] 0.3642 0.160 2.273 0.023 0.050 0.678
base[funcall] 2.0794 0.250 8.318 0.000 1.589 2.569
base[initvar] 1.0330 0.180 5.749 0.000 0.681 1.385
base[order] 2.5190 0.300 8.395 0.000 1.931 3.107
base[overload] 1.6020 0.211 7.594 0.000 1.189 2.015
base[partition] 0.8177 0.172 4.768 0.000 0.482 1.154
base[rectangle] 2.9378 0.363 8.098 0.000 2.227 3.649
base[scope] 0.0629 0.159 0.396 0.692 -0.248 0.374
base[whitespace] 1.8362 0.230 7.999 0.000 1.386 2.286
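Because this is a logit-link model with one indicator per base and no intercept, each coefficient is simply the log-odds of a correct grade for that base; applying the inverse logit recovers the raw proportions printed earlier (e.g. between: 1/(1 + exp(0.3950)) ≈ 0.403). A quick check:

import numpy as np
1.0 / (1.0 + np.exp(-fit.params))   # inverse logit of each base coefficient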
In [193]:
ax = plot.misc.fit_coefficients_base(fit)
ax.figure.tight_layout()
ax.figure
Out[193]:
In [174]:
m = sm.logit(formula="to_int(grade_correct) ~ program_name - 1", data=trials)
fit = m.fit()
fit.summary()
Optimization terminated successfully.
         Current function value: 0.473106
         Iterations 8

Out[174]:
Logit Regression Results
Dep. Variable: to_int(grade_correct) No. Observations: 1602
Model: Logit Df Residuals: 1577
Method: MLE Df Model: 24
Date: Thu, 07 Nov 2013 Pseudo R-squ.: 0.1739
Time: 15:11:16 Log-Likelihood: -757.92
converged: True LL-Null: -917.49
LLR p-value: 2.284e-53
coef std err z P>|z| [95.0% Conf. Int.]
program_name[between_functions] -0.6931 0.255 -2.714 0.007 -1.194 -0.193
program_name[between_inline] -0.1782 0.212 -0.842 0.400 -0.593 0.237
program_name[counting_nospace] 1.3581 0.264 5.139 0.000 0.840 1.876
program_name[counting_twospaces] -0.6523 0.247 -2.645 0.008 -1.136 -0.169
program_name[funcall_nospace] 2.0369 0.434 4.693 0.000 1.186 2.888
program_name[funcall_space] 1.5686 0.348 4.513 0.000 0.887 2.250
program_name[funcall_vars] 3.2189 0.721 4.464 0.000 1.806 4.632
program_name[initvar_bothbad] 1.5640 0.367 4.267 0.000 0.846 2.282
program_name[initvar_good] 0.6931 0.289 2.401 0.016 0.127 1.259
program_name[initvar_onebad] 0.9555 0.304 3.145 0.002 0.360 1.551
program_name[order_inorder] 2.3979 0.426 5.624 0.000 1.562 3.234
program_name[order_shuffled] 2.6271 0.423 6.214 0.000 1.799 3.456
program_name[overload_multmixed] 1.7148 0.362 4.736 0.000 1.005 2.424
program_name[overload_plusmixed] 1.7430 0.410 4.254 0.000 0.940 2.546
program_name[overload_strings] 1.3863 0.337 4.112 0.000 0.726 2.047
program_name[partition_balanced] 0.4055 0.289 1.405 0.160 -0.160 0.971
program_name[partition_unbalanced] 1.0745 0.310 3.471 0.001 0.468 1.681
program_name[partition_unbalanced_pivot] 0.9808 0.303 3.240 0.001 0.387 1.574
program_name[rectangle_basic] 3.8286 1.011 3.788 0.000 1.847 5.810
program_name[rectangle_class] 2.6027 0.518 5.023 0.000 1.587 3.618
program_name[rectangle_tuples] 2.8332 0.594 4.769 0.000 1.669 3.998
program_name[scope_diffname] 0.2177 0.221 0.986 0.324 -0.215 0.651
program_name[scope_samename] -0.1054 0.230 -0.459 0.647 -0.556 0.345
program_name[whitespace_linedup] 2.0513 0.354 5.793 0.000 1.357 2.745
program_name[whitespace_zigzag] 1.6546 0.303 5.466 0.000 1.061 2.248
In [187]:
ax = plot.misc.fit_coefficients_version(fit, figsize=(12, 8))
ax.figure.tight_layout()
ax.figure
Out[187]:
In [82]:
means = fit.conf_int().apply(lambda x: np.mean(x), axis=1)
err = fit.conf_int().apply(lambda x: (x[1] - x[0]) / 2.0, axis=1)
names = sorted(trials.program_name.unique())
colors = []
last_base = None
color_i = -1
for n in names:
    base = n.split("_")[0]
    if base != last_base:
        color_i += 1
        last_base = base
    colors.append(kelly_colors[color_i])
    
ax = means.plot(kind="bar", yerr=err, error_kw={ "ecolor": "black"}, color=colors, figsize=(12, 8))
ax.set_title("Logit Coefficients for Correct Grade by Base/Version")
ax.set_ylabel("Coefficient")
ax.set_xlabel("Program Base/Version")
ax.set_xticklabels(names)
fig = ax.figure
fig.tight_layout()
fig
Out[82]:

Duration from Program Base/Version

In [47]:
m = sm.ols(formula="duration_ms_log ~ base -1", data=trials)
fit = m.fit()
fit.summary()
Out[47]:
OLS Regression Results
Dep. Variable: duration_ms_log R-squared: 0.997
Model: OLS Adj. R-squared: 0.997
Method: Least Squares F-statistic: 6.035e+04
Date: Thu, 07 Nov 2013 Prob (F-statistic): 0.00
Time: 11:17:38 Log-Likelihood: -1357.4
No. Observations: 1602 AIC: 2735.
Df Residuals: 1592 BIC: 2789.
Df Model: 10
coef std err t P>|t| [95.0% Conf. Int.]
base[between] 11.9904 0.045 266.945 0.000 11.902 12.079
base[counting] 10.8884 0.045 243.931 0.000 10.801 10.976
base[funcall] 10.3847 0.044 233.368 0.000 10.297 10.472
base[initvar] 10.9474 0.045 244.490 0.000 10.860 11.035
base[order] 10.9793 0.045 245.967 0.000 10.892 11.067
base[overload] 10.3769 0.045 232.472 0.000 10.289 10.464
base[partition] 10.6605 0.045 238.083 0.000 10.573 10.748
base[rectangle] 11.2353 0.045 250.133 0.000 11.147 11.323
base[scope] 10.7979 0.045 240.395 0.000 10.710 10.886
base[whitespace] 11.5817 0.045 258.655 0.000 11.494 11.670
Omnibus: 155.980 Durbin-Watson: 1.140
Prob(Omnibus): 0.000 Jarque-Bera (JB): 218.183
Skew: 0.758 Prob(JB): 4.19e-48
Kurtosis: 3.985 Cond. No. 1.01
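Since the response is log duration in milliseconds, exponentiating the per-base coefficients puts them back on the original scale; exp(11.9904) is roughly 161,000 ms, i.e. about 2.7 minutes for the between programs. A one-liner:

import numpy as np
np.exp(fit.params) / 1000.0   # per-base geometric-mean duration, in seconds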
In [52]:
means = fit.conf_int().apply(lambda x: np.mean(x), axis=1)
err = fit.conf_int().apply(lambda x: (x[1] - x[0]) / 2.0, axis=1)
ax = means.plot(kind="bar", yerr=err, error_kw={ "ecolor": "black"}, color=kelly_colors)
ax.set_title("OLS Coefficients for Log Duration (ms) by Program")
ax.set_ylabel("Coefficient (95% CI)")
ax.set_ylim(10, 12.2)
ax.set_xlabel("Program Base")
ax.set_xticklabels(sorted(trials.base.unique()))
fig = ax.figure
fig.tight_layout()
fig
Out[52]:
In [41]:
m = sm.ols(formula="duration_ms_log ~ program_name -1", data=trials)
fit = m.fit()
fit.summary()
Out[41]:
OLS Regression Results
Dep. Variable: duration_ms_log R-squared: 0.997
Model: OLS Adj. R-squared: 0.997
Method: Least Squares F-statistic: 2.417e+04
Date: Thu, 07 Nov 2013 Prob (F-statistic): 0.00
Time: 11:07:16 Log-Likelihood: -1349.0
No. Observations: 1602 AIC: 2748.
Df Residuals: 1577 BIC: 2882.
Df Model: 25
coef std err t P>|t| [95.0% Conf. Int.]
program_name[between_functions] 11.9825 0.068 175.829 0.000 11.849 12.116
program_name[between_inline] 11.9965 0.060 201.046 0.000 11.879 12.114
program_name[counting_nospace] 10.9755 0.060 181.881 0.000 10.857 11.094
program_name[counting_twospaces] 10.7834 0.066 162.756 0.000 10.653 10.913
program_name[funcall_nospace] 10.4056 0.079 132.553 0.000 10.252 10.560
program_name[funcall_space] 10.3400 0.074 139.109 0.000 10.194 10.486
program_name[funcall_vars] 10.4137 0.079 132.656 0.000 10.260 10.568
program_name[initvar_bothbad] 10.9568 0.079 139.574 0.000 10.803 11.111
program_name[initvar_good] 10.9287 0.077 141.868 0.000 10.778 11.080
program_name[initvar_onebad] 10.9572 0.077 142.238 0.000 10.806 11.108
program_name[order_inorder] 10.8999 0.067 163.383 0.000 10.769 11.031
program_name[order_shuffled] 11.0436 0.060 184.045 0.000 10.926 11.161
program_name[overload_multmixed] 10.2579 0.074 139.188 0.000 10.113 10.402
program_name[overload_plusmixed] 10.4209 0.083 126.204 0.000 10.259 10.583
program_name[overload_strings] 10.4672 0.076 137.129 0.000 10.317 10.617
program_name[partition_balanced] 10.6149 0.080 132.593 0.000 10.458 10.772
program_name[partition_unbalanced] 10.6486 0.076 139.506 0.000 10.499 10.798
program_name[partition_unbalanced_pivot] 10.7140 0.076 140.363 0.000 10.564 10.864
program_name[rectangle_basic] 11.1206 0.083 134.678 0.000 10.959 11.283
program_name[rectangle_class] 11.2706 0.074 151.628 0.000 11.125 11.416
program_name[rectangle_tuples] 11.2971 0.077 146.651 0.000 11.146 11.448
program_name[scope_diffname] 10.7570 0.062 173.122 0.000 10.635 10.879
program_name[scope_samename] 10.8425 0.065 166.976 0.000 10.715 10.970
program_name[whitespace_linedup] 11.5761 0.064 181.758 0.000 11.451 11.701
program_name[whitespace_zigzag] 11.5872 0.063 184.222 0.000 11.464 11.711
Omnibus: 169.169 Durbin-Watson: 1.139
Prob(Omnibus): 0.000 Jarque-Bera (JB): 243.154
Skew: 0.795 Prob(JB): 1.58e-53
Kurtosis: 4.056 Cond. No. 1.38
In [45]:
means = fit.conf_int().apply(lambda x: np.mean(x), axis=1) - 10
err = fit.conf_int().apply(lambda x: (x[1] - x[0]) / 2.0, axis=1)
names = sorted(trials.program_name.unique())
colors = []
last_base = None
color_i = -1
for n in names:
    base = n.split("_")[0]
    if base != last_base:
        color_i += 1
        last_base = base
    colors.append(kelly_colors[color_i])
    
ax = means.plot(kind="bar", yerr=err, error_kw={ "ecolor": "black"}, color=colors, figsize=(12, 8))
ax.set_title("OLS Coefficients for Log Duration (ms) by Base/Version")
ax.set_ylabel("Coefficient - 10")
ax.set_xlabel("Program Base/Version")
ax.set_xticklabels(names)
fig = ax.figure
fig.tight_layout()
fig
Out[45]:
In [66]:
m = sm.ols(formula="duration_ms_log ~ code_lines", data=trials)
fit = m.fit()
fit.summary()
Out[66]:
OLS Regression Results
Dep. Variable: duration_ms_log R-squared: 0.227
Model: OLS Adj. R-squared: 0.226
Method: Least Squares F-statistic: 469.2
Date: Thu, 07 Nov 2013 Prob (F-statistic): 1.85e-91
Time: 11:34:07 Log-Likelihood: -1587.6
No. Observations: 1602 AIC: 3179.
Df Residuals: 1600 BIC: 3190.
Df Model: 1
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 10.2632 0.037 277.496 0.000 10.191 10.336
code_lines 0.0637 0.003 21.662 0.000 0.058 0.069
Omnibus: 20.175 Durbin-Watson: 1.407
Prob(Omnibus): 0.000 Jarque-Bera (JB): 20.790
Skew: 0.276 Prob(JB): 3.06e-05
Kurtosis: 2.921 Cond. No. 28.7
In [67]:
graphics.regressionplots.plot_fit(fit, 1)
Out[67]:
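On the original scale, the code_lines fit says each extra line multiplies the expected duration by exp(0.0637) ≈ 1.066, roughly 6-7% per line. A sketch of the implied curve over an illustrative range of program lengths:

import numpy as np
import pandas

lines = np.arange(5, 30)
pred_sec = np.exp(10.2632 + 0.0637 * lines) / 1000.0   # fitted model, back-transformed to seconds
pandas.Series(pred_sec, index=lines)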
In [208]:
t = trials[trials.response_proportion < .4]
t.grade_value.hist().figure
Out[208]:
In [207]:
t = trials[trials.response_proportion >= .4]
t.grade_value.hist().figure
Out[207]:
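These last two cells split the trials at a response proportion of 0.4 and compare the grade distributions by eye. One possible follow-up, reusing the scipy.stats import from earlier, would be a Mann-Whitney U test on grade_value between the two groups:

low = trials[trials.response_proportion < .4].grade_value.values
high = trials[trials.response_proportion >= .4].grade_value.values
scipy.stats.mannwhitneyu(low, high)   # (U statistic, one-sided p-value in older scipy)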