%run prelude.ipy
import statsmodels.formula.api as sm
from statsmodels import graphics
from statsmodels.api import families
from patsy import Treatment
def program_names(df):
    """Build the "<base>_<version>" label for every row of a pandas DataFrame.

    The frame must provide "base" and "version" columns. The labels are
    returned as a plain list, in row order.
    """
    def label(row):
        return "{0}_{1}".format(row["base"], row["version"])

    labels = df.apply(label, axis=1)
    return list(labels.values)
def names_distance(df1, df2):
    """Return the normalized edit distance between two program orderings.

    Each DataFrame is reduced to its list of "<base>_<version>" names (via
    program_names). The names of df1 are numbered 0..n-1 in row order, the
    names of df2 are mapped onto those numbers, and the edit distance between
    the two index sequences is divided by the number of names in df1.

    Returns 0.0 when both frames are empty (two empty orderings are identical).

    Raises:
        ValueError: if a name from df2 does not occur in df1.
    """
    from nltk.metrics import edit_distance

    names1 = program_names(df1)
    names2 = program_names(df2)
    if not names1 and not names2:
        # Nothing to compare; avoids dividing by zero below.
        return 0.0

    # Map each name to its FIRST position in names1 (what list.index returned),
    # so each lookup is O(1) instead of a linear scan per name.
    first_index = {}
    for position, name in enumerate(names1):
        first_index.setdefault(name, position)

    idxs1 = np.arange(len(names1))
    try:
        idxs2 = np.array([first_index[n] for n in names2])
    except KeyError as err:
        # Keep the exception type callers would have seen from list.index.
        raise ValueError("{0!r} is not in list".format(err.args[0]))
    return edit_distance(idxs1, idxs2) / float(len(idxs1))
def round_fmt(x):
    """Format x as text after rounding it to two decimal places."""
    rounded = np.round(x, 2)
    return str(rounded)
# Derived per-trial columns: duration per line of code, plus boolean
# threshold flags for response proportion and keystroke coefficient.
trials["norm_duration_ms"] = trials["duration_ms"] / trials["code_lines"]
trials["response_ge_half"] = trials["response_proportion"] >= 0.5
trials["keystroke_ge_one"] = trials["keystroke_coefficient"] >= 1

# Median duration of each (base, version) program, keyed by that pair.
duration_meds = {}
for base, version in programs[["base", "version"]].values:
    ts = util.filter_program(trials, base, version)
    duration_meds[(base, version)] = ts.duration_ms.median()

# Median duration over all trials; this (not the per-program median) is the
# threshold used for the duration flag below.
duration_all_med = trials["duration_ms"].median()
#trials["duration_ge_med"] = trials.apply(lambda t: t["duration_ms"] >= duration_meds[(t["base"], t["version"])], axis=1)
trials["duration_ge_med"] = trials["duration_ms"] >= duration_all_med

# Column groups used throughout the analysis.
complexity_cols = [
    "code_lines",
    "cyclo_comp",
    "hal_effort",
    "hal_volume",
]
performance_cols = [
    "grade_value",
    "duration_ms",
    "norm_duration_ms",
    "keystroke_coefficient",
    "response_proportion",
]
demo_cols = [
    "age",
    "py_years",
    "prog_years",
    "degree_num",
    "cs_major_num",
]
Programs and their Complexity Metrics
Let's start off by looking at metrics for each of the programs. The following metrics are shown below:
- `code_chars` - number of characters in the code
- `code_lines` - number of lines in the program (includes blank lines)
- `cyclo_comp` - McCabe's Cyclomatic Complexity (computed with PyMetrics 0.8.1)
- `hal_effort` - Halstead's Effort (computed with PyMetrics 0.8.1)
- `hal_volume` - Halstead's Volume (computed with PyMetrics 0.8.1)
- `output_chars` - number of characters in the correct output
- `output_lines` - number of lines in the correct output
# Display the programs table with its metric columns. show_dataframe is not
# defined in this section (presumably provided by prelude.ipy -- confirm).
show_dataframe(programs)
Of the 7 metrics listed above, only 4 are commonly used as measures of complexity: lines of code, Cyclomatic Complexity, Halstead Effort, and Halstead Volume. We will focus on these 4 metrics for the rest of the analysis.
Correlations Between Metrics
Although the 4 complexity metrics measure different aspects of a program's text, they are often found to be highly correlated in large repositories of program code. Let's see how correlated they are in our 25 programs.
fmts = { c: round_fmt for c in complexity_cols }
print programs[complexity_cols].corr("pearson").to_latex(formatters=fmts)
We can see strong correlations between the Halstead metrics and everything else. There is also a medium correlation between lines of code and Cyclomatic Complexity (usually higher in larger code bases). Because the two Halstead metrics are so highly correlated with each other, we will only consider Halstead Effort moving forward.
Visualizing the Complexity Space
With 3 complexity metrics remaining, we can visualize our programs as points in a 3D complexity space. Below is a series of plots from different angles of this space.
# One point per program; Halstead Effort is divided by 1000 before plotting.
xs = programs.code_lines
ys = programs.cyclo_comp
zs = programs.hal_effort / 1000.0


def plot_metrics(ax):
    """Label the three metric axes of `ax` and scatter the programs on it."""
    axis_labels = ("Lines of Code", "Cyclomatic Complexity", "Halstead Effort")
    ax.set_xlabel(axis_labels[0])
    ax.set_ylabel(axis_labels[1])
    ax.set_zlabel(axis_labels[2])
    ax.scatter3D(xs, ys, zs)


# plot3d_views receives the drawing callback and returns the figure.
fig = plot.misc.plot3d_views(plot_metrics, figsize=(20, 10))
fig.tight_layout()
fig.suptitle("Code Metrics Scatter Plot (all programs)")
fig
From the plots, we can see that our programs are fairly spread out along the lines of code and Cyclomatic Complexity axes. Only a few have high Halstead Effort, but this is not surprising given only a few have many lines of code (and these two metrics are strongly correlated).
Predicting Performance
If our 3 complexity metrics (lines of code, Cyclomatic Complexity, and Halstead Effort) are truly measuring the cognitive complexity of their respective programs, then we should be able to predict participants' performance using them.
For example, lines of code is often used as a proxy for cognitive complexity by assuming that human short-term memory is limited. More lines of code place a greater cognitive burden on the reader and, therefore, longer programs should be harder to understand.
We will attempt to use each complexity metric to predict performance in 4 different categories:
- Grade
  - A grade of 7 or higher (out of 10) is correct
  - More complex programs should result in a lower grade
- Trial duration
  - Time from start to finish (reading + responding)
  - More complex programs should take longer to read and respond to (higher duration)
- Keystroke coefficient
  - Number of actual keystrokes / required keystrokes
  - More complex programs should require more keystrokes due to mistakes/corrections (higher coefficient)
- Response proportion
  - Time spent responding / trial duration
  - More complex programs should require more reading time up front (lower proportion)
To predict performance, we will order the list of programs using the complexity metric and the performance metric. The normalized edit distance between these two orderings will be low (close to 0) if the orderings are very close and high (close to 1.0) otherwise.
def _sec_per_line(f):
    """Median duration of a group in seconds, divided by its mean line count."""
    return (f["duration_ms"].median() / 1000.0) / f["code_lines"].mean()


def _median_keystroke_coeff(f):
    """Median keystroke coefficient of a group."""
    return f["keystroke_coefficient"].median()


def _median_response_prop(f):
    """Median response proportion of a group."""
    return f["response_proportion"].median()


program_key = ["base", "version"]
by_program = trials.groupby(program_key)
by_program_correct = trials[trials.grade_correct].groupby(program_key)

# Seconds per line by program (all trials), largest first
sec_per_line_all = by_program.apply(_sec_per_line).order(ascending=False)
med_duration_all = pandas.DataFrame(
    sec_per_line_all, columns=["sec_per_line"]).reset_index()

# Seconds per line by program (correct trials only), largest first
sec_per_line_correct = by_program_correct.apply(_sec_per_line).order(ascending=False)
med_duration_correct = pandas.DataFrame(
    sec_per_line_correct, columns=["sec_per_line"]).reset_index()

# Median keystroke coefficient by program, largest first
key_coeff = by_program.apply(_median_keystroke_coeff).order(ascending=False)
med_keystroke_coeff = pandas.DataFrame(
    key_coeff, columns=["key_coeff"]).reset_index()

# Median response proportion by program, smallest first
resp_prop = by_program.apply(_median_response_prop).order(ascending=True)
med_response_prop = pandas.DataFrame(
    resp_prop, columns=["resp_prop"]).reset_index()
By Lines of Code
Grade
# Correlation matrix (DataFrame.corr, default method) between code_lines and
# grade_value over all rows of trials.
trials[["code_lines", "grade_value"]].corr()
Trial Duration
# Correlation of code_lines with the raw trial duration.
trials[["code_lines", "duration_ms"]].corr()
# Same, using the duration divided by code_lines (norm_duration_ms).
trials[["code_lines", "norm_duration_ms"]].corr()
# Same again, restricted to rows where grade_correct is true.
trials[trials.grade_correct][["code_lines", "norm_duration_ms"]].corr()
Keystroke Coefficient
# Correlation matrix (DataFrame.corr, default method) between code_lines and
# keystroke_coefficient over all rows of trials.
trials[["code_lines", "keystroke_coefficient"]].corr()
Response Proportion
# Correlation matrix (DataFrame.corr, default method) between code_lines and
# response_proportion over all rows of trials.
trials[["code_lines", "response_proportion"]].corr()
By All Metrics (Regressors)
# Correlate every complexity metric with every performance metric, using both
# Pearson and Spearman, over all trials and over correct trials only.
#
# The two trial subsets do not depend on the loop variables, so the
# correct-only filter is applied once here instead of on every iteration.
trial_subsets = {False: trials, True: trials[trials.grade_correct]}
filter_labels = {False: "all", True: "correct"}

rows = []
for comp_col in complexity_cols:
    for perf_col in performance_cols:
        for corr_alg in ["pearson", "spearman"]:
            for correct_only in [False, True]:
                ts = trial_subsets[correct_only]
                # Off-diagonal entry of the 2x2 correlation matrix.
                result = ts[[comp_col, perf_col]].corr(corr_alg).values[0, 1]
                rows.append([comp_col, perf_col, corr_alg,
                             filter_labels[correct_only], result])

corr_df = pandas.DataFrame(rows, columns=["Complexity Metric", "Performance Metric",
                                          "Algorithm", "Trial Filter", "Correlation"])

# Sort descending by absolute value
corr_df = corr_df.reindex(corr_df["Correlation"].abs().order(ascending=False).index)
show_dataframe(corr_df)
rows = []
for comp_col in complexity_cols:
for perf_col in [c for c in performance_cols if c != "norm_duration_ms"]:
result = trials[[comp_col, perf_col]].corr().values[0, 1]
rows.append([comp_col, perf_col, result])
corr_df = pandas.DataFrame(rows, columns=["Complexity Metric", "Performance Metric", "Correlation"])
# Sort descending by absolute value
#corr_df = corr_df.reindex(corr_df["Correlation"].abs().order(ascending=False).index)
#show_dataframe(corr_df)
fmts = [round_fmt] * (len(performance_cols) - 1)
print corr_df.pivot("Complexity Metric", "Performance Metric", "Correlation").to_latex(formatters=fmts)
from eyecode import classify
cp_cols = performance_cols #["duration_ms", "response_proportion"]
rows = len(cp_cols)
fig, axes = pyplot.subplots(rows, 2, figsize=(15, 5 * rows))
for i, perf_col in enumerate(cp_cols):
# Feature importance
fi_df = classify.feature_importances(trials, complexity_cols, perf_col, regressor=True)
ax = plot.misc.feature_importances(fi_df, ax=axes[i, 0])
ax.set_title("Feature Importances ({0})".format(perf_col))
# Cross-validation
cv_df = classify.cross_validation(trials, complexity_cols, perf_col, regressor=True)
ax = plot.misc.cross_validation(cv_df, ax=axes[i, 1])
ax.set_title("$R^2$ ({0}, CV=10)".format(perf_col))
print perf_col, classify.cross_val_performance(cv_df).mean()
fig.tight_layout()
fig
By All Metrics (Classifiers)
from eyecode import classify

# Feature importances and repeated cross-validation for grade_correct,
# using only the complexity metrics as features.
target = "grade_correct"
importances = classify.feature_importances(trials, complexity_cols, target)
cross_val = classify.cross_validation(trials, complexity_cols, target, repeat=10)
axes = plot.misc.importances_and_crossval(
    importances, cross_val, target, figsize=(12, 5), repeat=10)

summary_fig = axes[0].figure
summary_fig.tight_layout()
summary_fig
# Same grade_correct analysis, with the demographic columns added to the
# complexity metrics as features.
target = "grade_correct"
feature_cols = complexity_cols + demo_cols
importances = classify.feature_importances(trials, feature_cols, target)
cross_val = classify.cross_validation(trials, feature_cols, target, repeat=10)
axes = plot.misc.importances_and_crossval(
    importances, cross_val, target, figsize=(12, 5), repeat=10)

summary_fig = axes[0].figure
summary_fig.tight_layout()
summary_fig
# Compare the scores of the "dummy" classifier rows in cross_val against
# classify.cross_val_performance(cross_val) using stats.wilcox_test.
dummy_scores = cross_val[cross_val.classifier == "dummy"].score
model_scores = classify.cross_val_performance(cross_val)
stats.wilcox_test(dummy_scores, model_scores)
scores = classify.one_at_a_time(trials, complexity_cols + demo_cols, "grade_correct", norm=True)
print zip(complexity_cols + demo_cols, scores)
cp_cols = ["grade_value", "duration_ms", "response_proportion", "keystroke_coefficient"]
rows = len(cp_cols)
fig, axes = pyplot.subplots(rows, 2, figsize=(15, 5 * rows))
for i, perf_col in enumerate(cp_cols):
# Feature importance and cross validation
fi_df = classify.feature_importances(trials, complexity_cols, perf_col, regressor=True)
cv_df = classify.cross_validation(trials, complexity_cols, perf_col, repeat=10, regressor=True)
print perf_col, classify.cross_val_performance(cv_df).mean()
# Plot both
plot.misc.importances_and_crossval(fi_df, cv_df, perf_col,
axes=[axes[i, 0], axes[i, 1]],
repeat=10, regressor=True)
fig.tight_layout()
fig
cp_cols = ["grade_value", "duration_ms", "response_proportion", "keystroke_coefficient"]
rows = len(cp_cols)
fig, axes = pyplot.subplots(rows, 2, figsize=(15, 5 * rows))
for i, perf_col in enumerate(cp_cols):
# Feature importance and cross validation
fi_df = classify.feature_importances(trials, demo_cols, perf_col, regressor=True)
cv_df = classify.cross_validation(trials, demo_cols, perf_col, repeat=10, regressor=True)
print perf_col, classify.cross_val_performance(cv_df).mean()
# Plot both
plot.misc.importances_and_crossval(fi_df, cv_df, perf_col,
axes=[axes[i, 0], axes[i, 1]],
repeat=10, regressor=True)
fig.tight_layout()
fig
from eyecode import classify
cp_cols = ["grade_correct", "duration_ge_med", "response_ge_half", "keystroke_ge_one"]
rows = len(cp_cols)
fig, axes = pyplot.subplots(rows, 2, figsize=(15, 5 * rows))
for i, perf_col in enumerate(cp_cols):
# Feature importance and cross validation
fi_df = classify.feature_importances(trials, complexity_cols, perf_col)
cv_df = classify.cross_validation(trials, complexity_cols, perf_col, repeat=10)
print perf_col, classify.cross_val_performance(cv_df).mean()
# Plot both
plot.misc.importances_and_crossval(fi_df, cv_df, perf_col,
axes=[axes[i, 0], axes[i, 1]],
repeat=10)
fig.tight_layout()
fig