In [1]:
%run prelude.ipy
In [2]:
import statsmodels.formula.api as sm
from statsmodels import graphics
from statsmodels.api import families
from patsy import Treatment
from eyecode import classify
In [5]:
# Gaussian GLM of log trial duration on the numeric base/version codes.
# NOTE(review): no constant column is added here (statsmodels' array API
# does not add one automatically), so this fit passes through the origin —
# confirm that is intended.
log_duration = np.log(trials.duration_ms)
predictors = trials[["base_num", "version_num"]]
sm.GLM(log_duration, predictors).fit().summary()
Out[5]:
In [15]:
# Boxplots of log duration (ms) grouped by program base, regression mode.
group_cols = ["base_num"]
value_cols = ["duration_ms_log"]
plot.misc.classify_boxplots(trials, group_cols, value_cols, regressor=True)
Out[15]:
In [23]:
# NOTE(review): this cell is an exact duplicate of the classify_boxplots
# call above (same arguments) — consider deleting one of the two.
group_cols = ["base_num"]
value_cols = ["duration_ms_log"]
plot.misc.classify_boxplots(trials, group_cols, value_cols, regressor=True)
Out[23]:
In [24]:
import scipy.stats
In [55]:
# Response-proportion distributions for every version of the "overload" base.
# `base`, `versions`, and `b_trials` are reused by the cells below.
base = "overload"
versions = programs.loc[programs.base == base, "version"].values
b_trials = util.filter_program(trials, base)
axes = plot.misc.respprop_distributions(b_trials)
axes[0].figure
Out[55]:
In [57]:
# One response-proportion sample per program version, for the
# Kruskal-Wallis test in the next cell.
samples = [util.filter_program(b_trials, base, v).response_proportion.values
           for v in versions]
In [58]:
# Kruskal-Wallis H-test across the per-version samples built above
# (masked-array variant from scipy.stats.mstats).
scipy.stats.mstats.kruskalwallis(*samples)
Out[58]:
In [31]:
# Inspect the experiments table.
experiments
Out[31]:
In [16]:
# Gaussian GLM of total grade on participant background variables.
# NOTE(review): no constant column is added, so the fit has no intercept —
# confirm that is intended.
outcome = experiments.total_grade
predictors = experiments[["age", "py_years", "prog_years"]]
sm.GLM(outcome, predictors).fit().summary()
Out[16]:
In [17]:
# Inspect the trials table.
trials
Out[17]:
In [21]:
# Gaussian GLM of grade value on code metrics and behavioral measures.
# NOTE(review): no constant column is added, so the fit has no intercept —
# confirm that is intended.
outcome = trials.grade_value
predictors = trials[["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "response_proportion", "duration_ms", "keystroke_efficiency"]]
sm.GLM(outcome, predictors).fit().summary()
Out[21]:
In [41]:
# Same response as above, but only the two behavioral predictors.
# NOTE(review): no constant column is added, so the fit has no intercept —
# confirm that is intended.
outcome = trials.grade_value
predictors = trials[["duration_ms", "response_proportion"]]
sm.GLM(outcome, predictors).fit().summary()
Out[41]:
In [23]:
# Logistic regression: is the grade correct, given metrics + behavior?
# NOTE(review): no constant column is added, so the fit has no intercept —
# confirm that is intended.
outcome = trials.grade_correct
predictors = trials[["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "response_proportion", "duration_ms", "keystroke_efficiency"]]
sm.Logit(outcome, predictors).fit().summary()
Out[23]:
In [46]:
# Random-forest-style feature importances for predicting grade_correct.
feature_cols = ["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "duration_ms", "response_proportion", "keystroke_efficiency"]
importance_df = classify.feature_importances(trials, feature_cols, "grade_correct")
fig = plot.misc.feature_importances(importance_df).figure
fig.tight_layout()
fig
Out[46]:
In [52]:
# How strongly does response proportion track correctness?
corr_cols = ["response_proportion", "grade_correct"]
trials[corr_cols].corr()
Out[52]:
In [28]:
# Feature importances for grade_value (regression variant, metrics only).
feature_cols = ["code_lines", "cyclo_comp", "hal_eff", "hal_vol", "duration_ms"]
classify.feature_importances(trials, feature_cols, "grade_value", regressor=True)
Out[28]:
In [36]:
# Which demographic variables best predict total grade? (1000 estimators)
demo_cols = ["age", "py_years", "prog_years", "degree_num", "gender_num", "cs_major_num"]
imp_df = classify.feature_importances(experiments, demo_cols, "total_grade", regressor=True, num_estimators=1000)
plot.misc.feature_importances(imp_df).figure
Out[36]:
In [78]:
# Per-feature area under the ROC curve for classifying grade_correct.
auc_cols = ["cyclo_comp", "hal_eff", "hal_vol", "duration_ms", "response_proportion", "keystroke_efficiency"]
auc_df = classify.area_under_curve(trials, auc_cols, "grade_correct")
plot.misc.area_under_curve(auc_df).figure
Out[78]:
In [94]:
# Same AUC analysis, regression variant against grade_value.
auc_cols = ["cyclo_comp", "hal_eff", "hal_vol", "duration_ms", "response_proportion", "keystroke_efficiency"]
auc_df = classify.area_under_curve(trials, auc_cols, "grade_value", regressor=True)
plot.misc.area_under_curve(auc_df).figure
Out[94]:
Grade from Program Base/Version
In []:
def to_int(x):
    """Cast an array/Series to integer dtype.

    Used inside patsy formulas below, e.g. "to_int(grade_correct) ~ base",
    to turn the boolean response into 0/1 for the GLM/logit fits.
    """
    return x.astype(int)
In [126]:
# Append one synthetic "_chance" row per experiment, alternating
# grade_correct True/False — presumably a coin-flip baseline to compare
# the real bases against (TODO confirm).
rows = []
g = True
for _, exp_trials in trials.groupby("exp_id"):
    rows.append({"base": "_chance", "grade_correct": g})
    g = not g
fake_trials = pandas.concat([trials, pandas.DataFrame(rows)], ignore_index=True)
In [134]:
# Fraction of correct grades per program base.
# Fix: the Python-2-only `print b, x` statement is replaced with a
# version-portable print(...) call producing byte-identical output
# ("<base> <fraction>"); the sum/len ratio is kept as-is (it differs
# from .mean() if grade_correct ever contains NaN).
for b, df in trials.groupby("base"):
    print("{0} {1}".format(b, df.grade_correct.sum() / float(len(df))))
In [192]:
# Binomial GLM of correctness on program base, no intercept (- 1),
# so each coefficient is the per-base effect. `fit` is reused below.
model = sm.glm("to_int(grade_correct) ~ base - 1", data=trials, family=families.Binomial())
fit = model.fit()
fit.summary()
Out[192]:
In [193]:
# Plot the per-base coefficients from the fit above.
coef_ax = plot.misc.fit_coefficients_base(fit)
fig = coef_ax.figure
fig.tight_layout()
fig
Out[193]:
In [174]:
# Logit of correctness on program base/version, no intercept (- 1),
# so each coefficient is the per-version effect. `fit` is reused below.
model = sm.logit("to_int(grade_correct) ~ program_name - 1", data=trials)
fit = model.fit()
fit.summary()
Out[174]:
In [187]:
# Plot the per-version coefficients from the fit above.
coef_ax = plot.misc.fit_coefficients_version(fit, figsize=(12, 8))
fig = coef_ax.figure
fig.tight_layout()
fig
Out[187]:
In [82]:
# Bar chart of the logit coefficients with 95% CI error bars, one color
# per program base (all versions of a base share a color).
ci = fit.conf_int()
means = ci.mean(axis=1)          # CI midpoint == point estimate
err = (ci[1] - ci[0]) / 2.0      # CI half-width for the error bars
names = sorted(trials.program_name.unique())

# Advance to the next kelly color whenever the base prefix changes.
colors = []
prev_base = None
color_i = -1
for n in names:
    base = n.split("_")[0]
    if base != prev_base:
        color_i += 1
        prev_base = base
    colors.append(kelly_colors[color_i])

ax = means.plot(kind="bar", yerr=err, error_kw={ "ecolor": "black"}, color=colors, figsize=(12, 8))
ax.set_title("Logit Coefficients for Correct Grade by Base/Version")
ax.set_ylabel("Coefficient")
ax.set_xlabel("Program Base/Version")
ax.set_xticklabels(names)
fig = ax.figure
fig.tight_layout()
fig
Out[82]:
Duration from Program Base/Version
In [47]:
# OLS of log duration on program base, no intercept (-1), so each
# coefficient is the per-base mean. `fit` is reused below.
model = sm.ols("duration_ms_log ~ base -1", data=trials)
fit = model.fit()
fit.summary()
Out[47]:
In [52]:
# Bar chart of the per-base OLS coefficients with 95% CI error bars.
ci = fit.conf_int()
means = ci.mean(axis=1)          # CI midpoint == point estimate
err = (ci[1] - ci[0]) / 2.0      # CI half-width for the error bars
ax = means.plot(kind="bar", yerr=err, error_kw={ "ecolor": "black"}, color=kelly_colors)
ax.set_title("OLS Coefficients for Log Duration (ms) by Program")
ax.set_ylabel("Coefficient (95% CI)")
ax.set_ylim(10, 12.2)
ax.set_xlabel("Program Base")
ax.set_xticklabels(sorted(trials.base.unique()))
fig = ax.figure
fig.tight_layout()
fig
Out[52]:
In [41]:
# OLS of log duration on program base/version, no intercept (-1), so each
# coefficient is the per-version mean. `fit` is reused below.
model = sm.ols("duration_ms_log ~ program_name -1", data=trials)
fit = model.fit()
fit.summary()
Out[41]:
In [45]:
# Bar chart of the per-version OLS coefficients (shifted down by 10 so the
# between-version differences are visible), one color per program base.
ci = fit.conf_int()
means = ci.mean(axis=1) - 10     # CI midpoint, shifted for readability
err = (ci[1] - ci[0]) / 2.0      # CI half-width for the error bars
names = sorted(trials.program_name.unique())

# Advance to the next kelly color whenever the base prefix changes.
colors = []
prev_base = None
color_i = -1
for n in names:
    base = n.split("_")[0]
    if base != prev_base:
        color_i += 1
        prev_base = base
    colors.append(kelly_colors[color_i])

ax = means.plot(kind="bar", yerr=err, error_kw={ "ecolor": "black"}, color=colors, figsize=(12, 8))
ax.set_title("OLS Coefficients for Log Duration (ms) by Base/Version")
ax.set_ylabel("Coefficient - 10")
ax.set_xlabel("Program Base/Version")
ax.set_xticklabels(names)
fig = ax.figure
fig.tight_layout()
fig
Out[45]:
In [66]:
# OLS of log duration on lines of code (intercept included by default).
# `fit` is reused by the plot_fit cell below.
model = sm.ols("duration_ms_log ~ code_lines", data=trials)
fit = model.fit()
fit.summary()
Out[66]:
In [67]:
# Observed vs. fitted values against exog column 1 (code_lines;
# column 0 is the intercept term from the formula above).
graphics.regressionplots.plot_fit(fit, 1)
Out[67]:
In [208]:
# Grade-value histogram for trials with response_proportion below 0.4.
low_resp = trials[trials.response_proportion < .4]
low_resp.grade_value.hist().figure
Out[208]:
In [207]:
# Grade-value histogram for trials with response_proportion of 0.4 or more.
high_resp = trials[trials.response_proportion >= .4]
high_resp.grade_value.hist().figure
Out[207]: