In [6]:
# Shared setup: presumably defines trials/experiments/programs, util, plot,
# stats, pandas/np/pyplot used below — confirm against prelude.ipy.
%run prelude.ipy
In [2]:
from eyecode.plot.kelly_colors import kelly_colors

# Per-line duration: total trial time normalized by program length.
trials["norm_duration_ms"] = trials["duration_ms"] / trials["code_lines"]

# Outcome measures and participant demographics used throughout the analysis.
performance_cols = [
    "grade_value",
    "duration_ms",
    "norm_duration_ms",
    "keystroke_coefficient",
    "response_proportion",
]
demographic_cols = ["age", "py_years", "prog_years", "degree_num"]

# Attach per-participant demographics to every trial via the experiment id.
trials = trials.merge(experiments[["exp_id"] + demographic_cols], on="exp_id")
Performance Measures
- Grade
  - A grade of 7 or higher (out of 10) is correct
  - More complex programs should result in a lower grade
- Trial duration
  - Time from start to finish (reading + responding)
  - More complex programs should take longer to read and respond to (higher duration)
- Keystroke coefficient
  - Number of actual keystrokes / required keystrokes
  - More complex programs should require more keystrokes due to mistakes/corrections (higher coefficient)
- Response proportion
  - Time spent responding / trial duration
  - More complex programs should require more reading time up front (higher proportion)
In [17]:
# Focus on one program base and compare grade distributions across its versions.
base = "between"
versions = programs.loc[programs.base == base, "version"].values
b_trials = util.filter_program(trials, base)
# One grade histogram per version; display the shared figure.
axes = plot.misc.grade_distributions(b_trials)
axes[0].figure
Out[17]:
In [12]:
# Grade samples for the two versions of this base (assumes exactly two versions).
grade_groups = [b_trials.loc[b_trials.version == v, "grade_value"] for v in versions]
fun_grades, in_grades = grade_groups
# Non-parametric test for a difference between the two grade distributions.
stats.wilcox_test(fun_grades, in_grades)
Out[12]:
In [22]:
# Run the Wilcoxon test on every performance and demographic measure,
# comparing the two versions of the current base.
rows = []
for col in performance_cols + demographic_cols:
    values_a, values_b = (b_trials.loc[b_trials.version == v, col] for v in versions)
    p_value = stats.wilcox_test(values_a, values_b)
    rows.append([col, p_value, p_value < 0.05])
df = pandas.DataFrame(rows, columns=["Measure", "P-Value", "Significant?"])
show_dataframe(df)
Out[22]:
In [32]:
# Split trials by whether the participant gave the common answer, then
# compare experience distributions (Python years / programming years).
common, not_common = util.split_by_boolean(b_trials, "grade_common")
fig, axes = pyplot.subplots(2, 2)
hist_specs = [
    (common.py_years, axes[0, 0], kelly_colors[0]),
    (not_common.py_years, axes[0, 1], kelly_colors[1]),
    (common.prog_years, axes[1, 0], kelly_colors[2]),
    (not_common.prog_years, axes[1, 1], kelly_colors[3]),
]
for series, ax, color in hist_specs:
    series.hist(ax=ax, color=color)
fig.tight_layout()
fig
Out[32]:
In [34]:
# Do common-answer and uncommon-answer groups differ in Python experience?
stats.wilcox_test(common.py_years, not_common.py_years)
Out[34]:
In [33]:
# Same comparison for overall programming experience.
stats.wilcox_test(common.prog_years, not_common.prog_years)
Out[33]:
Grade Distributions (all programs)
In [14]:
# Save a grade-distribution figure for every program base.
# NOTE: iterate unique bases — `programs` has one row per *version* (see the
# `versions` extraction below), so looping over programs.base directly would
# regenerate and re-save each base's figure once per version.
for base in programs.base.unique():
    versions = programs[programs.base == base].version.values
    b_trials = util.filter_program(trials, base)
    # One panel per version plus a combined panel.
    width = (len(versions) + 1) * 4
    axes = plot.misc.grade_distributions(b_trials, figsize=(width, 4))
    fig = axes[0].figure
    fig.tight_layout()
    fig.savefig("../../../Journal Article/figures/{0}_grade_distributions.png".format(base))
    # Release figure memory between iterations.
    pyplot.close("all")
Duration Distributions (all programs)
In [22]:
# Save a duration-distribution figure for every program base.
# NOTE: iterate unique bases — `programs` has one row per *version*, so
# looping over programs.base directly would redo each base several times.
for base in programs.base.unique():
    versions = programs[programs.base == base].version.values
    b_trials = util.filter_program(trials, base)
    # One panel per version plus a combined panel.
    width = (len(versions) + 1) * 4
    axes = plot.misc.duration_distributions(b_trials, figsize=(width, 4), colors=kelly_colors[5:])
    fig = axes[0].figure
    fig.tight_layout()
    fig.savefig("../../../Journal Article/figures/{0}_duration_distributions.png".format(base))
    # Release figure memory between iterations.
    pyplot.close("all")
Correct Grade Correlations by Base
In [111]:
# Correlation of correct-grade outcomes across program bases.
# NOTE(review): this cell is an exact duplicate of the next one — likely a
# stale re-run copy; consider deleting one.
ax = plot.misc.grade_correlations(trials, figsize=(10, 8))
ax.figure
Out[111]:
In [3]:
# Correlation of correct-grade outcomes across program bases
# (duplicate of the previous cell — kept as in the original).
ax = plot.misc.grade_correlations(trials, figsize=(10, 8))
ax.figure
Out[3]:
In [5]:
# Build a participant-by-base matrix of grade_correct values, dropping
# participants who did not see every base.
bases = list(programs.base.unique())
rows = []
for exp_id, exp_trials in trials.groupby("exp_id"):
    row = [exp_id]
    for b in bases:
        base_trials = util.filter_program(exp_trials, b)
        # One trial per (participant, base) is expected; NaN if the
        # participant never saw this base.
        row.append(base_trials.grade_correct.values[0] if len(base_trials) > 0 else np.NaN)
    rows.append(row)
df = pandas.DataFrame(rows, columns=["exp_id"] + bases).dropna()
In [17]:
def entropy(X):
    """Shannon entropy (in nats) of the empirical distribution of X.

    Parameters
    ----------
    X : pandas.Series
        Sample of discrete values.

    Returns
    -------
    float
        -sum(p * log(p)) over the distinct values of X, natural log.
    """
    # value_counts makes one pass over X instead of re-scanning it once
    # per distinct value as the per-value filtering loop did.
    probs = X.value_counts(normalize=True).values
    # probs are strictly positive, so log() is always finite here.
    return float(-np.sum(probs * np.log(probs)))
In [20]:
def mutual_information(XY, x_col, y_col):
    """Mutual information (in nats) between two discrete columns of XY.

    Parameters
    ----------
    XY : pandas.DataFrame
        Joint sample, one observation per row.
    x_col, y_col : str
        Names of the two discrete columns.

    Returns
    -------
    float
        sum over (x, y) of p(x, y) * log(p(x, y) / (p(x) * p(y))).
    """
    n = float(len(XY))
    x_series = XY[x_col]
    y_series = XY[y_col]
    mi = 0.0
    for x_v in x_series.unique():
        # Hoist the x marginal out of the inner loop.
        x_mask = x_series == x_v
        p_x = x_mask.sum() / n
        for y_v in y_series.unique():
            p_xy = (x_mask & (y_series == y_v)).sum() / n
            # Empty joint cells contribute 0 to MI. Skipping them up front
            # avoids the log(0) RuntimeWarning and the NaN check the
            # original needed to filter those terms out.
            if p_xy == 0:
                continue
            p_y = (y_series == y_v).sum() / n
            mi += p_xy * np.log(p_xy / (p_x * p_y))
    return mi
In [60]:
import itertools as it

# Pairwise normalized mutual information between program bases.
rows = []
for b1, b2 in it.product(bases, repeat=2):
    h_b1 = entropy(df[b1])
    h_b2 = entropy(df[b2])
    mi = mutual_information(df, b1, b2)
    # Redundancy: MI scaled by the sum of the marginal entropies.
    r = mi / float(h_b1 + h_b2)
    rows.append([b1, b2, r * 2])
x = pandas.DataFrame(rows, columns=["Base 1", "Base 2", "MI"])
x = x.pivot("Base 1", "Base 2", "MI")
# Mask the diagonal (~1.0) and near-zero pairs to highlight real overlap.
x[(x < .99) & (x >= 0.015)]
Out[60]:
In [73]:
import networkx
In [92]:
# Build a complete graph over bases, edge-weighted by the pairwise MI
# values in the matrix `x`, and draw it with a spring layout.
g = networkx.Graph()
for idx1, row1 in x.iterrows():
    for idx2, row2 in row1.iteritems():
        if idx1 != idx2:
            g.add_edge(idx1, idx2, weight=row2)
fig = pyplot.figure()
# NOTE(review): weight="weight.sum" looks suspect — no edge attribute named
# "weight.sum" is set above; confirm whether weight="weight" was intended.
networkx.draw_spring(g, weight="weight.sum")
fig
Out[92]:
In [37]:
# Heatmap of the pairwise MI matrix, labeled by base on both axes.
fig = pyplot.figure(figsize=(8, 8))
ax = pyplot.axes()
pyplot.pcolor(x.values)
# Center the tick labels on the pcolor cells.
tick_positions = np.arange(len(bases)) + 0.5
ax.set_yticks(tick_positions)
ax.set_yticklabels(bases)
ax.set_xticks(tick_positions)
ax.set_xticklabels(bases, rotation=90)
pyplot.close(fig)
fig
Out[37]:
In [100]:
from scipy.cluster.vq import whiten, kmeans2
In [142]:
# Cluster program bases into k=2 groups by their mean performance profile
# (grade, duration, keystroke coefficient, response proportion).
obs = []
# NOTE(review): this rebinds `bases`, shadowing the list built earlier from
# programs.base.unique() — confirm the groupby order matches downstream uses.
bases = []
for base, ts in trials.groupby("base"):
    bases.append(base)
    obs.append([ts.grade_value.mean(), ts.duration_ms.mean(),
    ts.keystroke_coefficient.mean(), ts.response_proportion.mean()])
# Whiten (scale each feature to unit variance) before k-means.
w = whiten(obs)
k = 2
# kmeans2 uses random initialization, so cluster assignments are not
# deterministic across runs.
_, labels = kmeans2(w, k)
for i in range(k):
    cls = []
    for j, l in enumerate(labels):
        if l == i:
            cls.append(bases[j])
    # Python 2 print statement — the notebook predates Python 3.
    print cls
Category
In [146]:
# Encode each program category as an integer: its index of first appearance.
values = list(programs["category"].unique())
programs["category_num"] = programs["category"].map(values.index)
programs
Out[146]:
Visualizing Performance Metrics
In [13]:
# 3D scatter of the three performance metrics for one program base,
# rendered from several viewpoints by plot.misc.plot3d_views.
base = "whitespace"
b_trials = util.filter_program(trials, base)
xs, ys, zs = b_trials.grade_value, np.log(b_trials.duration_ms / 1000.0), b_trials.response_proportion
# NOTE(review): b_trials holds a single base, so base_num is constant here
# and every point gets the same color — confirm whether coloring the full
# `trials` set by base was the intent.
colors = [kelly_colors[b] for b in b_trials.base_num]
def plot_metrics(ax):
    # Callback invoked once per view; closes over xs/ys/zs/colors above.
    ax.set_xlabel("Grade (0-10)")
    ax.set_ylabel("Duration (log sec)")
    ax.set_zlabel("Response Prop.")
    ax.scatter3D(xs, ys, zs, color=colors)
fig = plot.misc.plot3d_views(plot_metrics, figsize=(20, 10))
fig.tight_layout()
fig.suptitle("Performance Scatter Plot ({0})".format(base))
fig
Out[13]:
In [14]:
def pareto_frontier_multi(myArray):
    """Greedy dominance-chain filter over the rows of a 2-D array.

    Sorts the rows by their first column, always keeps the first row, and
    then keeps each later row only if it is >= the most recently kept row
    in every dimension.

    NOTE(review): this yields a monotone chain anchored at the smallest
    first-column row, not the full Pareto-optimal set — behavior kept
    exactly as in the original.

    Parameters
    ----------
    myArray : numpy.ndarray of shape (n, d)

    Returns
    -------
    numpy.ndarray
        The kept rows, in first-column sorted order.
    """
    ordered = myArray[myArray[:, 0].argsort()]
    # Seed the chain with the row having the smallest first coordinate.
    frontier = ordered[0:1, :]
    for candidate in ordered[1:, :]:
        # Keep only rows that dominate the last kept row in every dimension.
        if np.all(candidate >= frontier[-1]):
            frontier = np.concatenate((frontier, [candidate]))
    return frontier
In [26]:
# Normalize the four performance measures to a common [0, 1]-ish scale:
# flip grade so higher = worse everywhere, log-compress duration, then
# divide each column by its maximum.
x = trials[["grade_value", "duration_ms", "keystroke_coefficient", "response_proportion"]].copy()
x["grade_value"] = 10 - x["grade_value"]
x["duration_ms"] = np.log(x["duration_ms"])
# DataFrame / Series broadcasts column-wise: same effect as the per-column loop.
x /= x.max()
pareto_frontier_multi(x.values)
Out[26]:
In [24]:
# Raw normalized matrix that was fed to pareto_frontier_multi above.
x.values
Out[24]: