
Analysis By Program

In [6]:
%run prelude.ipy
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [2]:
from eyecode.plot.kelly_colors import kelly_colors
trials["norm_duration_ms"] = trials.duration_ms / trials.code_lines
performance_cols = ["grade_value", "duration_ms", "norm_duration_ms", "keystroke_coefficient","response_proportion"]
demographic_cols = ["age", "py_years", "prog_years", "degree_num"]
trials = trials.merge(experiments[["exp_id"] + demographic_cols], on="exp_id")

Performance Measures

  • Grade
    • A grade of 7 or higher (out of 10) is correct
    • More complex programs should result in a lower grade
  • Trial duration
    • Time from start to finish (reading + responding)
    • More complex programs should take longer to read and respond to (higher duration)
  • Keystroke coefficient
    • Number of actual keystrokes / required keystrokes
    • More complex programs should require more keystrokes due to mistakes/corrections (higher coefficient)
  • Response proportion
    • Time spent responding / trial duration
    • More complex programs should require more reading time up front (higher proportion)
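As a rough sketch (not part of the notebook run), the measures above could be derived from raw trial fields as follows. The column names total_keystrokes, required_keystrokes, and response_ms are hypothetical stand-ins for whatever prelude.ipy actually prepares upstream:

# Hedged sketch: deriving the performance measures from raw trial fields.
# total_keystrokes, required_keystrokes, and response_ms are hypothetical names.
trials["grade_correct"] = trials.grade_value >= 7
trials["keystroke_coefficient"] = trials.total_keystrokes / trials.required_keystrokes.astype(float)
trials["response_proportion"] = trials.response_ms / trials.duration_ms.astype(float)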
In [17]:
base = "between"
versions = programs[programs.base == base].version.values
b_trials = util.filter_program(trials, base)
axes = plot.misc.grade_distributions(b_trials)
axes[0].figure
Out[17]:
In [12]:
fun_grades, in_grades = [b_trials[b_trials.version == v].grade_value for v in versions]  # "between" has exactly two versions: functions, inline
stats.wilcox_test(fun_grades, in_grades)
Out[12]:
0.0953743024936958
In [22]:
rows = []
for col in performance_cols + demographic_cols:
    fun_values, in_values = [b_trials[b_trials.version == v][col] for v in versions]
    p = stats.wilcox_test(fun_values, in_values)
    rows.append([col, p, p < 0.05])
    
df = pandas.DataFrame(rows, columns=["Measure", "P-Value", "Significant?"])
show_dataframe(df)
Out[22]:
Measure P-Value Significant?
grade_value 0.095374 False
duration_ms 0.805100 False
norm_duration_ms 0.001564 True
keystroke_coefficient 0.173840 False
response_proportion 0.040996 True
age 0.073806 False
py_years 0.421903 False
prog_years 0.626145 False
degree_num 0.717689 False
In [32]:
common, not_common = util.split_by_boolean(b_trials, "grade_common")
fig, axes = pyplot.subplots(2, 2)

# Py years
common.py_years.hist(ax=axes[0, 0], color=kelly_colors[0])
not_common.py_years.hist(ax=axes[0, 1], color=kelly_colors[1])

# Prog years
common.prog_years.hist(ax=axes[1, 0], color=kelly_colors[2])
not_common.prog_years.hist(ax=axes[1, 1], color=kelly_colors[3])

fig.tight_layout()
fig
Out[32]:
In [34]:
stats.wilcox_test(common.py_years, not_common.py_years)
Out[34]:
0.025221174818262103
In [33]:
stats.wilcox_test(common.prog_years, not_common.prog_years)
Out[33]:
0.020540499771723515

Grade Distributions (all programs)

In [14]:
for base in programs.base.unique():
    versions = programs[programs.base == base].version.values
    b_trials = util.filter_program(trials, base)
    width = (len(versions) + 1) * 4
    axes = plot.misc.grade_distributions(b_trials, figsize=(width, 4))
    fig = axes[0].figure
    fig.tight_layout()
    fig.savefig("../../../Journal Article/figures/{0}_grade_distributions.png".format(base))
    pyplot.close("all")

Duration Distributions (all programs)

In [22]:
for base in programs.base.unique():
    versions = programs[programs.base == base].version.values
    b_trials = util.filter_program(trials, base)
    width = (len(versions) + 1) * 4
    axes = plot.misc.duration_distributions(b_trials, figsize=(width, 4), colors=kelly_colors[5:])
    fig = axes[0].figure
    fig.tight_layout()
    fig.savefig("../../../Journal Article/figures/{0}_duration_distributions.png".format(base))
    pyplot.close("all")

Correct Grade Correlations by Base

In [111]:
ax = plot.misc.grade_correlations(trials, figsize=(10, 8))
ax.figure
Out[111]:
In [5]:
bases = list(programs.base.unique())
rows = []
# One row per participant: whether their grade was "correct" for each program base
for exp_id, exp_trials in trials.groupby("exp_id"):
    row = [exp_id]
    for b in bases:
        t = util.filter_program(exp_trials, b)
        if len(t) > 0:
            row.append(t.grade_correct.values[0])
        else:
            row.append(np.NaN)
    rows.append(row)

# Keep only participants who saw every program base
df = pandas.DataFrame(rows, columns=["exp_id"] + bases).dropna()
In [17]:
def entropy(X):
    """Shannon entropy (natural log) of a discrete series."""
    x_values = X.unique()
    h = 0.0
    for x_v in x_values:
        p_x = len(X[X == x_v]) / float(len(X))
        h += (p_x * np.log(p_x))
    return -h
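As a quick sanity check (illustrative, not part of the original run), a balanced binary column should have entropy ln(2):

entropy(pandas.Series([0, 0, 1, 1]))  # -(0.5*ln 0.5 + 0.5*ln 0.5) = ln 2 ~= 0.693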
In [20]:
def mutual_information(XY, x_col, y_col):
    """Mutual information (natural log) between two discrete columns of XY."""
    x_values = XY[x_col].unique()
    y_values = XY[y_col].unique()
    mi = 0.0
    for x_v in x_values:
        for y_v in y_values:
            p_xy = len(XY[(XY[x_col] == x_v) & (XY[y_col] == y_v)]) / float(len(XY))
            p_x = len(XY[XY[x_col] == x_v]) / float(len(XY))
            p_y = len(XY[XY[y_col] == y_v]) / float(len(XY))
            mi_part = (p_xy * np.log((p_xy / (p_x * p_y))))
            # p_xy == 0 yields 0 * log(0) == nan; by convention it contributes 0
            if not np.isnan(mi_part):
                mi += mi_part
    return mi
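The cell below normalizes this to a redundancy score r = 2 * MI / (H(X) + H(Y)), which runs from 0 (independent columns) to 1 (one column fully determines the other). A quick illustrative check with two identical columns, where MI equals the shared entropy:

xy = pandas.DataFrame({"a": [0, 0, 1, 1], "b": [0, 0, 1, 1]})
mi = mutual_information(xy, "a", "b")         # ln(2)
2 * mi / (entropy(xy.a) + entropy(xy.b))      # 1.0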
In [60]:
import itertools as it
rows = []
for b1, b2 in it.product(bases, bases):
    h_b1 = entropy(df[b1])
    h_b2 = entropy(df[b2])
    mi = mutual_information(df, b1, b2)
    r = mi / float(h_b1 + h_b2)  # redundancy
    rows.append([b1, b2, r * 2])  # scaled so identical columns score 1.0
x = pandas.DataFrame(rows, columns=["Base 1", "Base 2", "MI"])
x = x.pivot("Base 1", "Base 2", "MI")
x[(x < .99) & (x >= 0.015)]
Out[60]:
Base 2 between counting funcall initvar order overload partition rectangle scope whitespace
Base 1
between NaN NaN 0.016650 NaN 0.020042 NaN NaN NaN NaN NaN
counting NaN NaN NaN NaN NaN 0.019437 NaN NaN 0.019824 NaN
funcall 0.016650 NaN NaN NaN NaN NaN 0.050151 NaN NaN NaN
initvar NaN NaN NaN NaN 0.056465 0.019551 0.074257 NaN 0.032393 NaN
order 0.020042 NaN NaN 0.056465 NaN NaN 0.025033 0.036383 0.020469 NaN
overload NaN 0.019437 NaN 0.019551 NaN NaN NaN 0.029870 0.023111 0.016487
partition NaN NaN 0.050151 0.074257 0.025033 NaN NaN NaN 0.030531 0.019319
rectangle NaN NaN NaN NaN 0.036383 0.029870 NaN NaN NaN 0.028151
scope NaN 0.019824 NaN 0.032393 0.020469 0.023111 0.030531 NaN NaN NaN
whitespace NaN NaN NaN NaN NaN 0.016487 0.019319 0.028151 NaN NaN
In [73]:
import networkx
In [92]:
g = networkx.Graph()
for idx1, row1 in x.iterrows():
    for idx2, row2 in row1.iteritems():
        if idx1 != idx2:
            g.add_edge(idx1, idx2, weight=row2)

fig = pyplot.figure()
# Weight the spring layout by the MI-based edge weights
pos = networkx.spring_layout(g, weight="weight")
networkx.draw(g, pos)
fig
Out[92]:
In [37]:
fig = pyplot.figure(figsize=(8, 8))
ax = pyplot.axes()
pyplot.pcolor(x.values)
ax.set_yticks(np.arange(len(bases)) + 0.5)
ax.set_yticklabels(bases)
ax.set_xticks(np.arange(len(bases)) + 0.5)
ax.set_xticklabels(bases, rotation=90)
pyplot.close(fig)
fig
Out[37]:
In [100]:
from scipy.cluster.vq import whiten, kmeans2
In [142]:
obs = []
bases = []
for base, ts in trials.groupby("base"):
    bases.append(base)
    obs.append([ts.grade_value.mean(), ts.duration_ms.mean(),
                ts.keystroke_coefficient.mean(), ts.response_proportion.mean()])
    
# Normalize each feature to unit variance, then cluster program bases into k groups
w = whiten(obs)
k = 2
_, labels = kmeans2(w, k)
for i in range(k):
    cls = []
    for j, l in enumerate(labels):
        if l == i:
            cls.append(bases[j])
    print cls
['counting', 'overload', 'partition']
['between', 'funcall', 'initvar', 'order', 'rectangle', 'scope', 'whitespace']

Category

In [146]:
values = list(programs["category"].unique())
# Map each category name to a stable integer code
programs["category_num"] = programs["category"].apply(lambda v: values.index(v))
programs
Out[146]:
base version code_chars code_lines cyclo_comp hal_effort hal_volume output_chars output_lines category category_num
0 between functions 496 24 7 94192.063393 830.218507 33 3 notation 0
1 between inline 365 19 7 45596.278445 660.815630 33 3 notation 0
2 counting nospace 77 3 2 738.402323 82.044703 116 8 notation 0
3 counting twospaces 81 5 2 738.402323 82.044703 116 8 notation 0
4 funcall nospace 50 4 2 937.653743 109.392937 3 1 notation 0
5 funcall space 54 4 2 937.653743 109.392937 3 1 notation 0
6 funcall vars 72 7 2 1735.731282 154.287225 3 1 notation 0
7 initvar bothbad 103 9 3 3212.495182 212.396376 5 2 expectation 1
8 initvar good 103 9 3 3212.495182 212.396376 6 2 expectation 1
9 initvar onebad 103 9 3 2866.823438 208.496250 6 2 expectation 1
10 order inorder 137 14 4 8372.306047 303.069902 6 1 notation 0
11 order shuffled 137 14 4 8372.306047 303.069902 6 1 notation 0
12 overload multmixed 78 11 1 2340.000000 120.000000 9 3 expectation 1
13 overload plusmixed 78 11 1 3428.296498 117.206718 7 3 expectation 1
14 overload strings 98 11 1 3428.296498 117.206718 21 3 expectation 1
15 partition balanced 105 5 4 2896.001287 188.869649 26 4 expectation 1
16 partition unbalanced 102 5 4 2382.342809 177.199052 19 3 expectation 1
17 partition unbalanced_pivot 120 6 4 2707.766879 196.214991 19 3 expectation 1
18 rectangle basic 293 18 2 18801.174998 396.335705 7 2 notation 0
19 rectangle class 421 21 5 43203.698685 620.148785 7 2 notation 0
20 rectangle tuples 277 14 2 15627.749381 403.817813 7 2 notation 0
21 scope diffname 144 12 3 2779.714286 188.000000 2 1 expectation 1
22 scope samename 156 12 3 2413.342134 183.623858 2 1 expectation 1
23 whitespace linedup 275 14 1 6480.000000 216.000000 13 3 notation 0
24 whitespace zigzag 259 14 1 6480.000000 216.000000 13 3 notation 0

Visualizing Performance Metrics

In [13]:
base = "whitespace"
b_trials = util.filter_program(trials, base)
xs, ys, zs = b_trials.grade_value, np.log(b_trials.duration_ms / 1000.0), b_trials.response_proportion
colors = [kelly_colors[b] for b in b_trials.base_num]
def plot_metrics(ax):
    ax.set_xlabel("Grade (0-10)")
    ax.set_ylabel("Duration (log sec)")
    ax.set_zlabel("Response Prop.")
    ax.scatter3D(xs, ys, zs, color=colors)
    
fig = plot.misc.plot3d_views(plot_metrics, figsize=(20, 10))
fig.tight_layout()
fig.suptitle("Performance Scatter Plot ({0})".format(base))
fig
Out[13]:
In [14]:
def pareto_frontier_multi(myArray):
    # Sort rows by the first dimension
    myArray = myArray[myArray[:,0].argsort()]
    # Seed the frontier with the first (lowest) row
    pareto_frontier = myArray[0:1,:]
    # Keep each row that is >= the most recently kept row in every
    # dimension (weak dominance against the last frontier point only)
    for row in myArray[1:,:]:
        if sum([row[x] >= pareto_frontier[-1][x]
                for x in range(len(row))]) == len(row):
            pareto_frontier = np.concatenate((pareto_frontier, [row]))
    return pareto_frontier
In [26]:
x = trials[["grade_value", "duration_ms", "keystroke_coefficient", "response_proportion"]].copy()
x["grade_value"] = 10 - x["grade_value"]  # invert so larger = worse on every axis
x["duration_ms"] = np.log(x["duration_ms"])
for col in x.columns:
    x[col] /= x[col].max()  # scale each column to [0, 1]

pareto_frontier_multi(x.values)
Out[26]:
array([[ 0.        ,  0.77521026,  0.09565217,  0.48357515],
       [ 0.        ,  0.84724094,  0.14855072,  0.82240144],
       [ 0.        ,  0.86840344,  0.81884058,  0.89824064]])
In [24]:
x.values
Out[24]:
array([[  0.        ,  10.62132735,   2.2       ,   0.46043902],
       [  0.        ,  10.1266311 ,   1.375     ,   0.71836   ],
       [  1.        ,  10.73639668,   1.8       ,   0.39523913],
       ..., 
       [  1.        ,  10.1266311 ,   1.66666667,   0.6782    ],
       [  0.        ,  10.77895629,   1.        ,   0.1728125 ],
       [ 10.        ,  10.98529272,  11.        ,   0.27088136]])