{
"cells": [
{
"cell_type": "markdown",
"id": "1e115450",
"metadata": {},
"source": [
"# Predicting student's final exam outcome\n",
"\n",
"This section aims to predict the `student final exam outcome`\n",
"(Pass (score >= 40) / Fail (score < 40)).\n",
"We try to replicate the machine learning analysis techniques from the work of\n",
"Tomasevic et al. (2020) {cite}`tomasevic_2020`.\n",
"\n",
"**Keywords**: Predicting student outcome\n",
"\n",
"```{bibliography}\n",
":filter: docname in docnames\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "7e1dc5f6",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import Markdown, display\n",
"from sklearn.ensemble import VotingClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import f1_score\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from sklearn.svm import SVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"from multicons import MultiCons\n",
"\n",
"from oulad import filter_by_module_presentation, get_oulad\n",
"\n",
"%load_ext oulad.capture"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "acc2bd24",
"metadata": {},
"outputs": [],
"source": [
"%%capture oulad\n",
"oulad = get_oulad()"
]
},
{
"cell_type": "markdown",
"id": "c8903118",
"metadata": {
"lines_to_next_cell": 2
},
"source": [
"## Preparing train/test data\n",
"\n",
"### Selecting features\n",
"\n",
"In the work of Tomasevic et al. the student data from the `DDD` module of the\n",
"`2013J` and `2014B` presentations combined is used.\n",
"\n",
"Similarly, we try to select the same seven attributes, drawn from the three distinct\n",
"types below:\n",
"\n",
"| DEMOGRAPHIC | ENGAGEMENT | PERFORMANCE |\n",
"|---------------------|--------------------------------|-------------------------|\n",
"| - gender | - sum of clicks per assessment | - scores per assessment |\n",
"| - highest_education | | - number of attempts |\n",
"| - age_band | | - final_exam score |"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b3fc551f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age_band | \n",
" gender | \n",
" highest_education | \n",
" num_of_prev_attempts | \n",
" final_result | \n",
" assessment_1_sum_click | \n",
" assessment_2_sum_click | \n",
" assessment_3_sum_click | \n",
" assessment_4_sum_click | \n",
" assessment_5_sum_click | \n",
" assessment_6_sum_click | \n",
" assessment_1_score | \n",
" assessment_2_score | \n",
" assessment_3_score | \n",
" assessment_4_score | \n",
" assessment_5_score | \n",
" assessment_6_score | \n",
" final_exam_score | \n",
"
\n",
" \n",
" id_student | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 8462 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.75 | \n",
" 0.0 | \n",
" Withdrawn | \n",
" 340.0 | \n",
" 176.0 | \n",
" 90.0 | \n",
" 40.0 | \n",
" NaN | \n",
" NaN | \n",
" 93.0 | \n",
" 83.0 | \n",
" 87.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 27417 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.25 | \n",
" 0.0 | \n",
" Withdrawn | \n",
" 43.0 | \n",
" 180.0 | \n",
" 103.0 | \n",
" 23.0 | \n",
" 52.0 | \n",
" 9.0 | \n",
" 48.0 | \n",
" 58.0 | \n",
" 52.0 | \n",
" 6.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 27935 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.25 | \n",
" 0.0 | \n",
" Fail | \n",
" 301.0 | \n",
" 286.0 | \n",
" 61.0 | \n",
" 118.0 | \n",
" 47.0 | \n",
" NaN | \n",
" 75.0 | \n",
" 73.0 | \n",
" 90.0 | \n",
" 63.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 28046 | \n",
" 0.5 | \n",
" 1.0 | \n",
" 0.75 | \n",
" 0.0 | \n",
" Fail | \n",
" 127.0 | \n",
" 131.0 | \n",
" 20.0 | \n",
" 45.0 | \n",
" 202.0 | \n",
" 53.0 | \n",
" 58.0 | \n",
" 57.0 | \n",
" 49.0 | \n",
" 49.0 | \n",
" 69.0 | \n",
" 24.0 | \n",
" 40.0 | \n",
"
\n",
" \n",
" 29411 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.50 | \n",
" 0.0 | \n",
" Pass | \n",
" 134.0 | \n",
" 78.0 | \n",
" 40.0 | \n",
" 59.0 | \n",
" 246.0 | \n",
" 57.0 | \n",
" 75.0 | \n",
" 79.0 | \n",
" 93.0 | \n",
" 58.0 | \n",
" 86.0 | \n",
" 66.0 | \n",
" 62.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2692948 | \n",
" 0.5 | \n",
" 1.0 | \n",
" 0.25 | \n",
" 1.0 | \n",
" Pass | \n",
" 2.0 | \n",
" 28.0 | \n",
" 107.0 | \n",
" 59.0 | \n",
" 47.0 | \n",
" 113.0 | \n",
" NaN | \n",
" NaN | \n",
" 73.0 | \n",
" 50.0 | \n",
" 53.0 | \n",
" 41.0 | \n",
" 53.0 | \n",
"
\n",
" \n",
" 2694886 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.75 | \n",
" 0.0 | \n",
" Pass | \n",
" 210.0 | \n",
" 125.0 | \n",
" 43.0 | \n",
" 82.0 | \n",
" 224.0 | \n",
" 45.0 | \n",
" 71.0 | \n",
" 76.0 | \n",
" 85.0 | \n",
" 66.0 | \n",
" 64.0 | \n",
" 62.0 | \n",
" 69.0 | \n",
"
\n",
" \n",
" 2696376 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.25 | \n",
" 3.0 | \n",
" Fail | \n",
" NaN | \n",
" 40.0 | \n",
" 87.0 | \n",
" 1.0 | \n",
" 53.0 | \n",
" 92.0 | \n",
" NaN | \n",
" 56.0 | \n",
" 39.0 | \n",
" NaN | \n",
" 27.0 | \n",
" 26.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 2698251 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.50 | \n",
" 0.0 | \n",
" Fail | \n",
" 447.0 | \n",
" 158.0 | \n",
" 59.0 | \n",
" 257.0 | \n",
" 360.0 | \n",
" 115.0 | \n",
" 69.0 | \n",
" 76.0 | \n",
" 53.0 | \n",
" 67.0 | \n",
" 62.0 | \n",
" 36.0 | \n",
" 44.0 | \n",
"
\n",
" \n",
" 2710343 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.25 | \n",
" 1.0 | \n",
" Fail | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
2875 rows × 18 columns
\n",
"
"
],
"text/plain": [
" age_band gender highest_education num_of_prev_attempts \\\n",
"id_student \n",
"8462 1.0 0.0 0.75 0.0 \n",
"27417 0.0 0.0 0.25 0.0 \n",
"27935 0.0 1.0 0.25 0.0 \n",
"28046 0.5 1.0 0.75 0.0 \n",
"29411 0.0 0.0 0.50 0.0 \n",
"... ... ... ... ... \n",
"2692948 0.5 1.0 0.25 1.0 \n",
"2694886 0.0 0.0 0.75 0.0 \n",
"2696376 0.0 1.0 0.25 3.0 \n",
"2698251 0.0 1.0 0.50 0.0 \n",
"2710343 0.0 0.0 0.25 1.0 \n",
"\n",
" final_result assessment_1_sum_click assessment_2_sum_click \\\n",
"id_student \n",
"8462 Withdrawn 340.0 176.0 \n",
"27417 Withdrawn 43.0 180.0 \n",
"27935 Fail 301.0 286.0 \n",
"28046 Fail 127.0 131.0 \n",
"29411 Pass 134.0 78.0 \n",
"... ... ... ... \n",
"2692948 Pass 2.0 28.0 \n",
"2694886 Pass 210.0 125.0 \n",
"2696376 Fail NaN 40.0 \n",
"2698251 Fail 447.0 158.0 \n",
"2710343 Fail NaN NaN \n",
"\n",
" assessment_3_sum_click assessment_4_sum_click \\\n",
"id_student \n",
"8462 90.0 40.0 \n",
"27417 103.0 23.0 \n",
"27935 61.0 118.0 \n",
"28046 20.0 45.0 \n",
"29411 40.0 59.0 \n",
"... ... ... \n",
"2692948 107.0 59.0 \n",
"2694886 43.0 82.0 \n",
"2696376 87.0 1.0 \n",
"2698251 59.0 257.0 \n",
"2710343 NaN NaN \n",
"\n",
" assessment_5_sum_click assessment_6_sum_click \\\n",
"id_student \n",
"8462 NaN NaN \n",
"27417 52.0 9.0 \n",
"27935 47.0 NaN \n",
"28046 202.0 53.0 \n",
"29411 246.0 57.0 \n",
"... ... ... \n",
"2692948 47.0 113.0 \n",
"2694886 224.0 45.0 \n",
"2696376 53.0 92.0 \n",
"2698251 360.0 115.0 \n",
"2710343 NaN NaN \n",
"\n",
" assessment_1_score assessment_2_score assessment_3_score \\\n",
"id_student \n",
"8462 93.0 83.0 87.0 \n",
"27417 48.0 58.0 52.0 \n",
"27935 75.0 73.0 90.0 \n",
"28046 58.0 57.0 49.0 \n",
"29411 75.0 79.0 93.0 \n",
"... ... ... ... \n",
"2692948 NaN NaN 73.0 \n",
"2694886 71.0 76.0 85.0 \n",
"2696376 NaN 56.0 39.0 \n",
"2698251 69.0 76.0 53.0 \n",
"2710343 NaN NaN NaN \n",
"\n",
" assessment_4_score assessment_5_score assessment_6_score \\\n",
"id_student \n",
"8462 NaN NaN NaN \n",
"27417 6.0 NaN NaN \n",
"27935 63.0 NaN NaN \n",
"28046 49.0 69.0 24.0 \n",
"29411 58.0 86.0 66.0 \n",
"... ... ... ... \n",
"2692948 50.0 53.0 41.0 \n",
"2694886 66.0 64.0 62.0 \n",
"2696376 NaN 27.0 26.0 \n",
"2698251 67.0 62.0 36.0 \n",
"2710343 NaN NaN NaN \n",
"\n",
" final_exam_score \n",
"id_student \n",
"8462 NaN \n",
"27417 NaN \n",
"27935 NaN \n",
"28046 40.0 \n",
"29411 62.0 \n",
"... ... \n",
"2692948 53.0 \n",
"2694886 69.0 \n",
"2696376 NaN \n",
"2698251 44.0 \n",
"2710343 NaN \n",
"\n",
"[2875 rows x 18 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%capture -ns predicting_students_final_exam_outcome feature_table\n",
"\n",
"\n",
"def get_feature_table(max_date=500, code_presentation=\"2013J\"):\n",
" \"\"\"Returns the feature table computed from the OULAD dataset.\"\"\"\n",
" assessments = (\n",
" filter_by_module_presentation(oulad.assessments, \"DDD\", code_presentation)\n",
" # Filter out assessments that are after the max_date.\n",
" .query(f\"date <= {max_date} or assessment_type == 'Exam'\").set_index(\n",
" \"id_assessment\"\n",
" )\n",
" )\n",
" vle = (\n",
" filter_by_module_presentation(oulad.student_vle, \"DDD\", code_presentation)\n",
" .loc[:, [\"id_student\", \"date\", \"sum_click\"]]\n",
" # Categorize the date field by assessment date.\n",
" .assign(\n",
" date=lambda df: pd.cut(\n",
" df.date,\n",
" [-26] + assessments.date.values.tolist(),\n",
" labels=assessments.date.values,\n",
" )\n",
" )\n",
" # Sum scores by date.\n",
" .groupby([\"id_student\", \"date\"], observed=True)\n",
" .agg(\"sum\")\n",
" .reset_index()\n",
" # Reshape the vle table.\n",
" .pivot(index=\"id_student\", columns=\"date\", values=\"sum_click\")\n",
" # Rename columns\n",
" .rename(\n",
" columns={\n",
" assessment.date: (\n",
" f\"assessment_{i+1}_sum_click\"\n",
" if assessment.assessment_type != \"Exam\"\n",
" else \"final_exam_sum_click\"\n",
" )\n",
" for i, (_, assessment) in enumerate(assessments.iterrows())\n",
" }\n",
" )\n",
" .drop(\"final_exam_sum_click\", axis=1)\n",
" )\n",
" return (\n",
" filter_by_module_presentation(oulad.student_info, \"DDD\", code_presentation)\n",
" .loc[\n",
" :,\n",
" [\n",
" \"age_band\",\n",
" \"gender\",\n",
" \"id_student\",\n",
" \"highest_education\",\n",
" \"num_of_prev_attempts\",\n",
" \"final_result\",\n",
" ],\n",
" ]\n",
" # Transform gender, age_band and highest_education to numeric values.\n",
" .replace(\n",
" {\n",
" \"age_band\": {\"0-35\": \"0.0\", \"35-55\": \"0.5\", \"55<=\": \"1.0\"},\n",
" \"gender\": {\"M\": \"0.0\", \"F\": \"1.0\"},\n",
" \"highest_education\": {\n",
" \"No Formal quals\": \"0.0\",\n",
" \"Lower Than A Level\": \"0.25\",\n",
" \"A Level or Equivalent\": \"0.5\",\n",
" \"HE Qualification\": \"0.75\",\n",
" \"Post Graduate Qualification\": \"1.0\",\n",
" },\n",
" }\n",
" )\n",
" .astype(\n",
" {\n",
" \"age_band\": float,\n",
" \"gender\": float,\n",
" \"highest_education\": float,\n",
" \"num_of_prev_attempts\": float,\n",
" }\n",
" )\n",
" .set_index(\"id_student\")\n",
" # Filter out students who have unregistered from the course before the start.\n",
" .join(\n",
" filter_by_module_presentation(\n",
" oulad.student_registration, \"DDD\", code_presentation\n",
" )\n",
" .set_index(\"id_student\")\n",
" .query(\"not date_unregistration < 0\")\n",
" .loc[:, []],\n",
" how=\"right\",\n",
" )\n",
" .join(vle)\n",
" .join(\n",
" assessments.join(oulad.student_assessment.set_index(\"id_assessment\"))\n",
" .reset_index()\n",
" .pivot(index=\"id_student\", columns=\"id_assessment\", values=\"score\")\n",
" .rename(\n",
" columns={\n",
" id_assessment: (\n",
" f\"assessment_{i+1}_score\"\n",
" if assessment.assessment_type != \"Exam\"\n",
" else \"final_exam_score\"\n",
" )\n",
" for i, (id_assessment, assessment) in enumerate(\n",
" assessments.iterrows()\n",
" )\n",
" }\n",
" )\n",
" )\n",
" )\n",
"\n",
"\n",
"feature_table = pd.concat(\n",
" [get_feature_table(), get_feature_table(code_presentation=\"2014B\")]\n",
")\n",
"display(feature_table)"
]
},
{
"cell_type": "markdown",
"id": "100a9874",
"metadata": {},
"source": [
"### Pre-Processing\n",
"\n",
"#### Handling NAs\n",
"\n",
"We notice many missing values in the `final_exam_score` column of the selected\n",
"feature table."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "99415cb2",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The feature table has 2875 rows and the final exam score column has 1383 rows with NAs (48%).\n"
]
}
],
"source": [
"print(\n",
" f\"The feature table has {len(feature_table)} rows and the final exam score \"\n",
" f\"column has {feature_table.final_exam_score.isna().sum()} rows with NAs \"\n",
" f\"({100*feature_table.final_exam_score.isna().sum() / len(feature_table):.0f}%).\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "aab2e3d7",
"metadata": {
"lines_to_next_cell": 2
},
"source": [
"This is explained in the original OULAD paper of Kuzilek et al.\n",
"\\[[KHZ17](first_descriptive_analysis)\\]:\n",
"> Results of the final exam are usually missing (since they are scored and used for\n",
"> the final marking immediately at the end of the module).\n",
"\n",
"Therefore, we use the `final_result` column to fill in the missing final exam\n",
"values and then remove the `final_result` column.\n",
"\n",
"We fill the remaining missing values in the other columns with the value `-1`."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "762fdbfd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age_band | \n",
" gender | \n",
" highest_education | \n",
" num_of_prev_attempts | \n",
" assessment_1_sum_click | \n",
" assessment_2_sum_click | \n",
" assessment_3_sum_click | \n",
" assessment_4_sum_click | \n",
" assessment_5_sum_click | \n",
" assessment_6_sum_click | \n",
" assessment_1_score | \n",
" assessment_2_score | \n",
" assessment_3_score | \n",
" assessment_4_score | \n",
" assessment_5_score | \n",
" assessment_6_score | \n",
" final_exam_score | \n",
"
\n",
" \n",
" id_student | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 8462 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.75 | \n",
" 0.0 | \n",
" 340.0 | \n",
" 176.0 | \n",
" 90.0 | \n",
" 40.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" 93.0 | \n",
" 83.0 | \n",
" 87.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 27417 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.25 | \n",
" 0.0 | \n",
" 43.0 | \n",
" 180.0 | \n",
" 103.0 | \n",
" 23.0 | \n",
" 52.0 | \n",
" 9.0 | \n",
" 48.0 | \n",
" 58.0 | \n",
" 52.0 | \n",
" 6.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 27935 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.25 | \n",
" 0.0 | \n",
" 301.0 | \n",
" 286.0 | \n",
" 61.0 | \n",
" 118.0 | \n",
" 47.0 | \n",
" -1.0 | \n",
" 75.0 | \n",
" 73.0 | \n",
" 90.0 | \n",
" 63.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 28046 | \n",
" 0.5 | \n",
" 1.0 | \n",
" 0.75 | \n",
" 0.0 | \n",
" 127.0 | \n",
" 131.0 | \n",
" 20.0 | \n",
" 45.0 | \n",
" 202.0 | \n",
" 53.0 | \n",
" 58.0 | \n",
" 57.0 | \n",
" 49.0 | \n",
" 49.0 | \n",
" 69.0 | \n",
" 24.0 | \n",
" 40.0 | \n",
"
\n",
" \n",
" 29411 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.50 | \n",
" 0.0 | \n",
" 134.0 | \n",
" 78.0 | \n",
" 40.0 | \n",
" 59.0 | \n",
" 246.0 | \n",
" 57.0 | \n",
" 75.0 | \n",
" 79.0 | \n",
" 93.0 | \n",
" 58.0 | \n",
" 86.0 | \n",
" 66.0 | \n",
" 62.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2692948 | \n",
" 0.5 | \n",
" 1.0 | \n",
" 0.25 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 28.0 | \n",
" 107.0 | \n",
" 59.0 | \n",
" 47.0 | \n",
" 113.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" 73.0 | \n",
" 50.0 | \n",
" 53.0 | \n",
" 41.0 | \n",
" 53.0 | \n",
"
\n",
" \n",
" 2694886 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.75 | \n",
" 0.0 | \n",
" 210.0 | \n",
" 125.0 | \n",
" 43.0 | \n",
" 82.0 | \n",
" 224.0 | \n",
" 45.0 | \n",
" 71.0 | \n",
" 76.0 | \n",
" 85.0 | \n",
" 66.0 | \n",
" 64.0 | \n",
" 62.0 | \n",
" 69.0 | \n",
"
\n",
" \n",
" 2696376 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.25 | \n",
" 3.0 | \n",
" -1.0 | \n",
" 40.0 | \n",
" 87.0 | \n",
" 1.0 | \n",
" 53.0 | \n",
" 92.0 | \n",
" -1.0 | \n",
" 56.0 | \n",
" 39.0 | \n",
" -1.0 | \n",
" 27.0 | \n",
" 26.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2698251 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.50 | \n",
" 0.0 | \n",
" 447.0 | \n",
" 158.0 | \n",
" 59.0 | \n",
" 257.0 | \n",
" 360.0 | \n",
" 115.0 | \n",
" 69.0 | \n",
" 76.0 | \n",
" 53.0 | \n",
" 67.0 | \n",
" 62.0 | \n",
" 36.0 | \n",
" 44.0 | \n",
"
\n",
" \n",
" 2710343 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.25 | \n",
" 1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" -1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
2875 rows × 17 columns
\n",
"
"
],
"text/plain": [
" age_band gender highest_education num_of_prev_attempts \\\n",
"id_student \n",
"8462 1.0 0.0 0.75 0.0 \n",
"27417 0.0 0.0 0.25 0.0 \n",
"27935 0.0 1.0 0.25 0.0 \n",
"28046 0.5 1.0 0.75 0.0 \n",
"29411 0.0 0.0 0.50 0.0 \n",
"... ... ... ... ... \n",
"2692948 0.5 1.0 0.25 1.0 \n",
"2694886 0.0 0.0 0.75 0.0 \n",
"2696376 0.0 1.0 0.25 3.0 \n",
"2698251 0.0 1.0 0.50 0.0 \n",
"2710343 0.0 0.0 0.25 1.0 \n",
"\n",
" assessment_1_sum_click assessment_2_sum_click \\\n",
"id_student \n",
"8462 340.0 176.0 \n",
"27417 43.0 180.0 \n",
"27935 301.0 286.0 \n",
"28046 127.0 131.0 \n",
"29411 134.0 78.0 \n",
"... ... ... \n",
"2692948 2.0 28.0 \n",
"2694886 210.0 125.0 \n",
"2696376 -1.0 40.0 \n",
"2698251 447.0 158.0 \n",
"2710343 -1.0 -1.0 \n",
"\n",
" assessment_3_sum_click assessment_4_sum_click \\\n",
"id_student \n",
"8462 90.0 40.0 \n",
"27417 103.0 23.0 \n",
"27935 61.0 118.0 \n",
"28046 20.0 45.0 \n",
"29411 40.0 59.0 \n",
"... ... ... \n",
"2692948 107.0 59.0 \n",
"2694886 43.0 82.0 \n",
"2696376 87.0 1.0 \n",
"2698251 59.0 257.0 \n",
"2710343 -1.0 -1.0 \n",
"\n",
" assessment_5_sum_click assessment_6_sum_click \\\n",
"id_student \n",
"8462 -1.0 -1.0 \n",
"27417 52.0 9.0 \n",
"27935 47.0 -1.0 \n",
"28046 202.0 53.0 \n",
"29411 246.0 57.0 \n",
"... ... ... \n",
"2692948 47.0 113.0 \n",
"2694886 224.0 45.0 \n",
"2696376 53.0 92.0 \n",
"2698251 360.0 115.0 \n",
"2710343 -1.0 -1.0 \n",
"\n",
" assessment_1_score assessment_2_score assessment_3_score \\\n",
"id_student \n",
"8462 93.0 83.0 87.0 \n",
"27417 48.0 58.0 52.0 \n",
"27935 75.0 73.0 90.0 \n",
"28046 58.0 57.0 49.0 \n",
"29411 75.0 79.0 93.0 \n",
"... ... ... ... \n",
"2692948 -1.0 -1.0 73.0 \n",
"2694886 71.0 76.0 85.0 \n",
"2696376 -1.0 56.0 39.0 \n",
"2698251 69.0 76.0 53.0 \n",
"2710343 -1.0 -1.0 -1.0 \n",
"\n",
" assessment_4_score assessment_5_score assessment_6_score \\\n",
"id_student \n",
"8462 -1.0 -1.0 -1.0 \n",
"27417 6.0 -1.0 -1.0 \n",
"27935 63.0 -1.0 -1.0 \n",
"28046 49.0 69.0 24.0 \n",
"29411 58.0 86.0 66.0 \n",
"... ... ... ... \n",
"2692948 50.0 53.0 41.0 \n",
"2694886 66.0 64.0 62.0 \n",
"2696376 -1.0 27.0 26.0 \n",
"2698251 67.0 62.0 36.0 \n",
"2710343 -1.0 -1.0 -1.0 \n",
"\n",
" final_exam_score \n",
"id_student \n",
"8462 0.0 \n",
"27417 0.0 \n",
"27935 0.0 \n",
"28046 40.0 \n",
"29411 62.0 \n",
"... ... \n",
"2692948 53.0 \n",
"2694886 69.0 \n",
"2696376 0.0 \n",
"2698251 44.0 \n",
"2710343 0.0 \n",
"\n",
"[2875 rows x 17 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def fill_nas(feature_table_df):\n",
" \"\"\"Fills NAs in the `final_exam_score` column with `final_result` values,\n",
" drops the `final_result` column and fills remaining NAs with the value `-1`.\n",
" \"\"\"\n",
"\n",
" mask = feature_table_df.final_exam_score.isna()\n",
" feature_table_df.loc[mask, \"final_exam_score\"] = (\n",
" feature_table_df[mask].final_result.isin([\"Pass\", \"Distinction\"]) * 40\n",
" )\n",
" return feature_table_df.drop(columns=\"final_result\").fillna(-1)\n",
"\n",
"\n",
"feature_table = fill_nas(feature_table)\n",
"display(feature_table)"
]
},
{
"cell_type": "markdown",
"id": "7db6b252",
"metadata": {
"tags": []
},
"source": [
"#### Splitting train/test data and Normalization\n",
"\n",
"Now we randomly split the feature table rows into a train (80%) and test (20%) table\n",
"and, as in the work of Tomasevic et al., we scale and normalize the selected\n",
"features:\n",
"\n",
"```{list-table}\n",
":header-rows: 1\n",
"\n",
"* - Feature\n",
" - Normalization\n",
"\n",
"* - Gender\n",
" - 0 = male\n",
"\n",
" 1 = female\n",
"\n",
"* - Age band\n",
" - 0.0 = 0-35\n",
"\n",
" 0.5 = 35-55\n",
"\n",
" 1.0 = 55<=\n",
"\n",
"* - Highest education\n",
" - 0.00 = No Formal quals\n",
"\n",
" 0.25 = Lower Than A Level\n",
"\n",
" 0.50 = A Level or Equivalent\n",
"\n",
" 0.75 = HE Qualification\n",
"\n",
" 1.00 = Post Graduate Qualification\n",
"\n",
"* - Number of attempts\n",
"\n",
" Sum of clicks per assessment\n",
" - 0-N scaled to [0-1]\n",
"\n",
"* - Scores per assessment\n",
"\n",
" Final exam score\n",
" - 0-100 scaled to [0-1]\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "41ae6d8a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age_band | \n",
" gender | \n",
" highest_education | \n",
" num_of_prev_attempts | \n",
" assessment_1_sum_click | \n",
" assessment_2_sum_click | \n",
" assessment_3_sum_click | \n",
" assessment_4_sum_click | \n",
" assessment_5_sum_click | \n",
" assessment_6_sum_click | \n",
" assessment_1_score | \n",
" assessment_2_score | \n",
" assessment_3_score | \n",
" assessment_4_score | \n",
" assessment_5_score | \n",
" assessment_6_score | \n",
"
\n",
" \n",
" id_student | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 628806 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.50 | \n",
" 0.000000 | \n",
" 0.297634 | \n",
" 0.171275 | \n",
" 0.179245 | \n",
" 0.442890 | \n",
" 0.105741 | \n",
" 0.026979 | \n",
" 0.95 | \n",
" 0.95 | \n",
" 0.98 | \n",
" 0.97 | \n",
" -0.01 | \n",
" -0.01 | \n",
"
\n",
" \n",
" 572976 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.50 | \n",
" 0.000000 | \n",
" 0.036129 | \n",
" 0.012887 | \n",
" 0.052411 | \n",
" 0.062937 | \n",
" 0.038154 | \n",
" 0.008165 | \n",
" 0.68 | \n",
" 0.43 | \n",
" 0.45 | \n",
" 0.56 | \n",
" 0.64 | \n",
" -0.01 | \n",
"
\n",
" \n",
" 588566 | \n",
" 0.5 | \n",
" 0.0 | \n",
" 0.50 | \n",
" 0.000000 | \n",
" 0.059355 | \n",
" 0.043580 | \n",
" 0.188679 | \n",
" 0.163947 | \n",
" 0.148619 | \n",
" 0.035144 | \n",
" 0.75 | \n",
" 0.90 | \n",
" 0.83 | \n",
" 0.80 | \n",
" 0.66 | \n",
" 0.62 | \n",
"
\n",
" \n",
" 607555 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.75 | \n",
" 0.000000 | \n",
" 0.038280 | \n",
" 0.017338 | \n",
" 0.040881 | \n",
" 0.076923 | \n",
" 0.062863 | \n",
" 0.084487 | \n",
" 0.70 | \n",
" 0.54 | \n",
" 0.73 | \n",
" 0.37 | \n",
" 0.15 | \n",
" 0.41 | \n",
"
\n",
" \n",
" 381539 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.00 | \n",
" 0.166667 | \n",
" 0.038710 | \n",
" 0.005389 | \n",
" 0.018868 | \n",
" 0.062937 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.79 | \n",
" 0.43 | \n",
" 0.66 | \n",
" -0.01 | \n",
" -0.01 | \n",
" -0.01 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 552313 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.25 | \n",
" 0.000000 | \n",
" 0.076989 | \n",
" 0.037254 | \n",
" 0.104822 | \n",
" 0.177156 | \n",
" 0.134448 | \n",
" 0.051828 | \n",
" 0.87 | \n",
" 0.78 | \n",
" 0.93 | \n",
" 0.87 | \n",
" 0.88 | \n",
" 0.79 | \n",
"
\n",
" \n",
" 563770 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.25 | \n",
" 0.000000 | \n",
" 0.054624 | \n",
" 0.036082 | \n",
" 0.047170 | \n",
" 0.003885 | \n",
" 0.010538 | \n",
" 0.000710 | \n",
" 0.98 | \n",
" 0.91 | \n",
" 0.87 | \n",
" -0.01 | \n",
" 0.63 | \n",
" -0.01 | \n",
"
\n",
" \n",
" 2377769 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.50 | \n",
" 0.000000 | \n",
" 0.237849 | \n",
" 0.036082 | \n",
" 0.161426 | \n",
" 0.165501 | \n",
" 0.080669 | \n",
" 0.055023 | \n",
" 0.65 | \n",
" 0.84 | \n",
" 0.75 | \n",
" 0.42 | \n",
" 0.66 | \n",
" 0.61 | \n",
"
\n",
" \n",
" 629822 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.25 | \n",
" 0.000000 | \n",
" 0.080430 | \n",
" 0.014292 | \n",
" 0.019916 | \n",
" 0.013209 | \n",
" 0.020349 | \n",
" 0.009585 | \n",
" 0.90 | \n",
" 0.74 | \n",
" 0.79 | \n",
" 0.22 | \n",
" 0.63 | \n",
" 0.32 | \n",
"
\n",
" \n",
" 1332356 | \n",
" 0.5 | \n",
" 0.0 | \n",
" 0.75 | \n",
" 0.000000 | \n",
" 0.099785 | \n",
" 0.015230 | \n",
" 0.082809 | \n",
" 0.037296 | \n",
" 0.009811 | \n",
" 0.003195 | \n",
" 0.75 | \n",
" 0.87 | \n",
" 0.90 | \n",
" 0.76 | \n",
" -0.01 | \n",
" 0.67 | \n",
"
\n",
" \n",
"
\n",
"
2300 rows × 16 columns
\n",
"
"
],
"text/plain": [
" age_band gender highest_education num_of_prev_attempts \\\n",
"id_student \n",
"628806 0.0 0.0 0.50 0.000000 \n",
"572976 0.0 1.0 0.50 0.000000 \n",
"588566 0.5 0.0 0.50 0.000000 \n",
"607555 0.0 0.0 0.75 0.000000 \n",
"381539 0.0 0.0 0.00 0.166667 \n",
"... ... ... ... ... \n",
"552313 0.0 1.0 0.25 0.000000 \n",
"563770 0.0 0.0 0.25 0.000000 \n",
"2377769 0.0 1.0 0.50 0.000000 \n",
"629822 0.0 0.0 0.25 0.000000 \n",
"1332356 0.5 0.0 0.75 0.000000 \n",
"\n",
" assessment_1_sum_click assessment_2_sum_click \\\n",
"id_student \n",
"628806 0.297634 0.171275 \n",
"572976 0.036129 0.012887 \n",
"588566 0.059355 0.043580 \n",
"607555 0.038280 0.017338 \n",
"381539 0.038710 0.005389 \n",
"... ... ... \n",
"552313 0.076989 0.037254 \n",
"563770 0.054624 0.036082 \n",
"2377769 0.237849 0.036082 \n",
"629822 0.080430 0.014292 \n",
"1332356 0.099785 0.015230 \n",
"\n",
" assessment_3_sum_click assessment_4_sum_click \\\n",
"id_student \n",
"628806 0.179245 0.442890 \n",
"572976 0.052411 0.062937 \n",
"588566 0.188679 0.163947 \n",
"607555 0.040881 0.076923 \n",
"381539 0.018868 0.062937 \n",
"... ... ... \n",
"552313 0.104822 0.177156 \n",
"563770 0.047170 0.003885 \n",
"2377769 0.161426 0.165501 \n",
"629822 0.019916 0.013209 \n",
"1332356 0.082809 0.037296 \n",
"\n",
" assessment_5_sum_click assessment_6_sum_click \\\n",
"id_student \n",
"628806 0.105741 0.026979 \n",
"572976 0.038154 0.008165 \n",
"588566 0.148619 0.035144 \n",
"607555 0.062863 0.084487 \n",
"381539 0.000000 0.000000 \n",
"... ... ... \n",
"552313 0.134448 0.051828 \n",
"563770 0.010538 0.000710 \n",
"2377769 0.080669 0.055023 \n",
"629822 0.020349 0.009585 \n",
"1332356 0.009811 0.003195 \n",
"\n",
" assessment_1_score assessment_2_score assessment_3_score \\\n",
"id_student \n",
"628806 0.95 0.95 0.98 \n",
"572976 0.68 0.43 0.45 \n",
"588566 0.75 0.90 0.83 \n",
"607555 0.70 0.54 0.73 \n",
"381539 0.79 0.43 0.66 \n",
"... ... ... ... \n",
"552313 0.87 0.78 0.93 \n",
"563770 0.98 0.91 0.87 \n",
"2377769 0.65 0.84 0.75 \n",
"629822 0.90 0.74 0.79 \n",
"1332356 0.75 0.87 0.90 \n",
"\n",
" assessment_4_score assessment_5_score assessment_6_score \n",
"id_student \n",
"628806 0.97 -0.01 -0.01 \n",
"572976 0.56 0.64 -0.01 \n",
"588566 0.80 0.66 0.62 \n",
"607555 0.37 0.15 0.41 \n",
"381539 -0.01 -0.01 -0.01 \n",
"... ... ... ... \n",
"552313 0.87 0.88 0.79 \n",
"563770 -0.01 0.63 -0.01 \n",
"2377769 0.42 0.66 0.61 \n",
"629822 0.22 0.63 0.32 \n",
"1332356 0.76 -0.01 0.67 \n",
"\n",
"[2300 rows x 16 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"RANDOM_STATE = 0\n",
"\n",
"\n",
"def normalized_train_test_split(feature_table_df):\n",
" \"\"\"Returns the normalized train/test split computed from the feature table.\"\"\"\n",
" x_train_, x_test_, y_train_, y_test_ = train_test_split(\n",
" feature_table_df.drop(columns=\"final_exam_score\"),\n",
" feature_table_df[\"final_exam_score\"],\n",
" test_size=0.2,\n",
" random_state=RANDOM_STATE,\n",
" )\n",
" # Scale scores per assessment and final_exam_score.\n",
" assessment_score_labels = feature_table_df.columns.values[\n",
" feature_table_df.columns.str.match(r\"assessment_[0-9]+_score\")\n",
" ]\n",
" x_train_.loc[:, assessment_score_labels] /= 100\n",
" x_test_.loc[:, assessment_score_labels] /= 100\n",
" y_train_ = (y_train_ / 100 >= 0.4).astype(int)\n",
" y_test_ = (y_test_ / 100 >= 0.4).astype(int)\n",
"\n",
" # Scale the sum of clicks per assessment and number of attempts.\n",
" columns_slice = feature_table_df.columns.values[\n",
" feature_table_df.columns.str.match(r\"assessment_[0-9]+_sum_click\")\n",
" ].tolist() + [\"num_of_prev_attempts\"]\n",
"\n",
" # Note: we fit the scaler only on the train data to avoid leaking information\n",
" # from the test data.\n",
" scaler = MinMaxScaler().fit(x_train_.loc[:, columns_slice])\n",
" x_train_.loc[:, columns_slice] = scaler.transform(x_train_.loc[:, columns_slice])\n",
" x_test_.loc[:, columns_slice] = scaler.transform(x_test_.loc[:, columns_slice])\n",
" return (x_train_, x_test_, y_train_, y_test_)\n",
"\n",
"\n",
"x_train, x_test, y_train, y_test = normalized_train_test_split(feature_table)\n",
"display(x_train)"
]
},
{
"cell_type": "markdown",
"id": "40c21308",
"metadata": {},
"source": [
"## Classification\n",
"\n",
"As in the work of Tomasevic et al., we will compare the classification performances\n",
"for the student final exam pass prediction (score >= 40).\n",
"\n",
"We use the same models and try to perform a grid search over the same Hyper-parameter\n",
"ranges if these were specified in the paper:\n",
"\n",
"- K-Nearest Neighbours (with & without `weights`, varying `K` between 1 and 50)\n",
"- Support Vector Machines (with `linear` and `RBF` kernels, varying `C` in\n",
"`[0.1, 1.0, 10]`, varying `gamma` in `[0.0001, 0.01, 0.1]`)\n",
"- Artificial Neural Networks (with one and two hidden layers)\n",
"- Decision Trees (with varying `max depth`, `split` strategy and `quality measure`)\n",
"- Naïve Bayes (with varying `var_smoothing`)\n",
"- Logistic Regression (with `lbfgs` and `saga` solvers)\n",
"\n",
"And the performance metric used here is also the F1 score.\n",
"\n",
"As a reminder, the formula of the F1 score is:\n",
"$F_1 = 2 \\cdot \\frac{precision \\cdot recall}{precision + recall}$\n",
"\n",
"However, in contrast to the paper, we use 5-fold cross validation during the grid\n",
"search phase."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8e4033e2",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" classifier | \n",
" score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" KNeighborsClassifier | \n",
" 0.9097 | \n",
"
\n",
" \n",
" 1 | \n",
" SVC | \n",
" 0.9113 | \n",
"
\n",
" \n",
" 2 | \n",
" MLPClassifier | \n",
" 0.9014 | \n",
"
\n",
" \n",
" 3 | \n",
" DecisionTreeClassifier | \n",
" 0.9154 | \n",
"
\n",
" \n",
" 4 | \n",
" GaussianNB | \n",
" 0.8865 | \n",
"
\n",
" \n",
" 5 | \n",
" LogisticRegression | \n",
" 0.8917 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" classifier score\n",
"0 KNeighborsClassifier 0.9097\n",
"1 SVC 0.9113\n",
"2 MLPClassifier 0.9014\n",
"3 DecisionTreeClassifier 0.9154\n",
"4 GaussianNB 0.8865\n",
"5 LogisticRegression 0.8917"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%capture -ns predicting_students_final_exam_outcome gs_scores\n",
"# Hyperparameter search space\n",
"\n",
"classifier_hyperparameters = {\n",
" # K-Nearest Neighbours\n",
" KNeighborsClassifier: [\n",
" # {\"n_neighbors\": range(1, 51), \"weights\":[\"uniform\", \"distance\"]}\n",
" # We reduce search space for speed\n",
" {\n",
" \"n_neighbors\": [24],\n",
" \"weights\": [\"distance\"],\n",
" }\n",
" ],\n",
" # Support Vector Machines\n",
" SVC: [\n",
" # {\n",
" # \"kernel\": [\"linear\"],\n",
" # \"C\": [0.1, 1.0, 10],\n",
" # \"probability\": [True],\n",
" # \"random_state\": [RANDOM_STATE],\n",
" # },\n",
" {\n",
" \"kernel\": [\"rbf\"],\n",
" \"C\": [10], # [0.1, 1.0, 10],\n",
" \"gamma\": [\"scale\"], # [\"scale\", \"auto\", 0.0001, 0.01, 0.1],\n",
" \"probability\": [True],\n",
" \"random_state\": [RANDOM_STATE],\n",
" },\n",
" ],\n",
" # Artificial Neural Networks\n",
" MLPClassifier: [\n",
" {\n",
" \"max_iter\": [1000],\n",
" \"validation_fraction\": [0.2],\n",
" \"hidden_layer_sizes\": [(10,)], # [(10,), (20,), (52, 10)],\n",
" \"random_state\": [RANDOM_STATE],\n",
" # [(i,) for i in range(2, 100, 10)] + [\n",
" # (i, j) for i in range(2, 100, 10) for j in range(2, 100, 10)\n",
" # ],\n",
" # As we do not notice any improvement by varying `activation` and `alpha`,\n",
" # we choose to keep the default values for these parameters.\n",
" # \"activation\": [\"logistic\", \"tanh\", \"relu\"],\n",
" # \"alpha\": 10.0 ** (- np.arange(-1,6))\n",
" },\n",
" ],\n",
" # Decision Tree\n",
" DecisionTreeClassifier: [\n",
" {\n",
" \"criterion\": [\"entropy\"], # [\"gini\", \"entropy\"],\n",
" \"splitter\": [\"best\"], # [\"best\", \"random\"],\n",
" \"max_depth\": [6], # [None, *list(range(1, 11))],\n",
" \"min_samples_split\": [2], # range(2, 11, 2),\n",
" \"min_samples_leaf\": [10], # range(2, 11, 2),\n",
" \"random_state\": [RANDOM_STATE],\n",
" },\n",
" ],\n",
" # Naive Bayes\n",
" GaussianNB: [\n",
" {\n",
" \"var_smoothing\": [1e-9], # [1e-9, 1e-8, 1e-7, 1e-6]\n",
" }\n",
" ],\n",
" # Logistic Regression\n",
" LogisticRegression: [\n",
" {\n",
" \"solver\": [\"lbfgs\"], # [\"lbfgs\", \"saga\"],\n",
" \"random_state\": [RANDOM_STATE],\n",
" }\n",
" ],\n",
"}\n",
"\n",
"\n",
"def get_grid_search_scores():\n",
" \"\"\"Returns the grid search scores.\"\"\"\n",
" classifier_score = {\"classifier\": [], \"score\": []}\n",
" for classifier, hyperparameters in classifier_hyperparameters.items():\n",
" gs_classifier = GridSearchCV(\n",
" classifier(), hyperparameters, scoring=\"f1\", n_jobs=-1\n",
" )\n",
" gs_classifier.fit(x_train, y_train)\n",
" classifier_score[\"classifier\"].append(classifier.__name__)\n",
" classifier_score[\"score\"].append(gs_classifier.score(x_test, y_test))\n",
"\n",
" return classifier_score\n",
"\n",
"\n",
"gs_scores = pd.DataFrame(get_grid_search_scores()).round(4)\n",
"display(gs_scores)"
]
},
{
"cell_type": "markdown",
"id": "35a8543a",
"metadata": {},
"source": [
"### Classification at different points in time\n",
"\n",
"Predicting student final exam outcome seems to be more valuable at an early stage of\n",
"the course as it might give instuctors more time to help the students at risk.\n",
"However, predicting early is more challenging as less data is available for the\n",
"classifiers.\n",
"\n",
"As in the work of Tomasevic et al., we will compare the classification performances at\n",
"different moments of the course based on the number of assessments passed.\n",
"\n",
"Let's start by taking a look at the assessment table for the selected courses."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f7961bb5",
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" code_module | \n",
" code_presentation | \n",
" id_assessment | \n",
" assessment_type | \n",
" date | \n",
" weight | \n",
"
\n",
" \n",
" \n",
" \n",
" 88 | \n",
" DDD | \n",
" 2013J | \n",
" 25348 | \n",
" TMA | \n",
" 25.0 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 95 | \n",
" DDD | \n",
" 2014B | \n",
" 25355 | \n",
" TMA | \n",
" 25.0 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 96 | \n",
" DDD | \n",
" 2014B | \n",
" 25356 | \n",
" TMA | \n",
" 53.0 | \n",
" 12.5 | \n",
"
\n",
" \n",
" 89 | \n",
" DDD | \n",
" 2013J | \n",
" 25349 | \n",
" TMA | \n",
" 53.0 | \n",
" 12.5 | \n",
"
\n",
" \n",
" 97 | \n",
" DDD | \n",
" 2014B | \n",
" 25357 | \n",
" TMA | \n",
" 74.0 | \n",
" 17.5 | \n",
"
\n",
" \n",
" 90 | \n",
" DDD | \n",
" 2013J | \n",
" 25350 | \n",
" TMA | \n",
" 88.0 | \n",
" 17.5 | \n",
"
\n",
" \n",
" 98 | \n",
" DDD | \n",
" 2014B | \n",
" 25358 | \n",
" TMA | \n",
" 116.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" 91 | \n",
" DDD | \n",
" 2013J | \n",
" 25351 | \n",
" TMA | \n",
" 123.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" 99 | \n",
" DDD | \n",
" 2014B | \n",
" 25359 | \n",
" TMA | \n",
" 158.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" 92 | \n",
" DDD | \n",
" 2013J | \n",
" 25352 | \n",
" TMA | \n",
" 165.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" 100 | \n",
" DDD | \n",
" 2014B | \n",
" 25360 | \n",
" TMA | \n",
" 200.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" 93 | \n",
" DDD | \n",
" 2013J | \n",
" 25353 | \n",
" TMA | \n",
" 207.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" code_module code_presentation id_assessment assessment_type date \\\n",
"88 DDD 2013J 25348 TMA 25.0 \n",
"95 DDD 2014B 25355 TMA 25.0 \n",
"96 DDD 2014B 25356 TMA 53.0 \n",
"89 DDD 2013J 25349 TMA 53.0 \n",
"97 DDD 2014B 25357 TMA 74.0 \n",
"90 DDD 2013J 25350 TMA 88.0 \n",
"98 DDD 2014B 25358 TMA 116.0 \n",
"91 DDD 2013J 25351 TMA 123.0 \n",
"99 DDD 2014B 25359 TMA 158.0 \n",
"92 DDD 2013J 25352 TMA 165.0 \n",
"100 DDD 2014B 25360 TMA 200.0 \n",
"93 DDD 2013J 25353 TMA 207.0 \n",
"\n",
" weight \n",
"88 10.0 \n",
"95 10.0 \n",
"96 12.5 \n",
"89 12.5 \n",
"97 17.5 \n",
"90 17.5 \n",
"98 20.0 \n",
"91 20.0 \n",
"99 20.0 \n",
"92 20.0 \n",
"100 20.0 \n",
"93 20.0 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oulad.assessments[\n",
" (oulad.assessments.code_module == \"DDD\")\n",
" & (oulad.assessments.assessment_type == \"TMA\")\n",
" & (\n",
" (oulad.assessments.code_presentation == \"2013J\")\n",
" | (oulad.assessments.code_presentation == \"2014B\")\n",
" )\n",
"].sort_values(\"date\")"
]
},
{
"cell_type": "markdown",
"id": "db721a00",
"metadata": {
"lines_to_next_cell": 2,
"tags": []
},
"source": [
"We note that each course module has six intermediary assessments.\n",
"\n",
"Next, we use the final submisssion `date` field to filter out assessment related\n",
"information after a given date and repeat the same data preprocessing and\n",
"classification process as done previously.\n",
"\n",
"We also add Voting and MultiCons ensemble methods to check whether they might improve\n",
"current results."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2cfd10c6",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"F1 score at different points in time:"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 25 | \n",
" 53 | \n",
" 88 | \n",
" 123 | \n",
" 165 | \n",
" 207 | \n",
"
\n",
" \n",
" \n",
" \n",
" KNeighborsClassifier | \n",
" 0.7063 | \n",
" 0.7758 | \n",
" 0.8325 | \n",
" 0.8800 | \n",
" 0.8969 | \n",
" 0.9097 | \n",
"
\n",
" \n",
" SVC | \n",
" 0.7219 | \n",
" 0.7968 | \n",
" 0.8271 | \n",
" 0.8767 | \n",
" 0.8935 | \n",
" 0.9113 | \n",
"
\n",
" \n",
" MLPClassifier | \n",
" 0.7374 | \n",
" 0.7927 | \n",
" 0.8358 | \n",
" 0.8711 | \n",
" 0.8838 | \n",
" 0.9014 | \n",
"
\n",
" \n",
" DecisionTreeClassifier | \n",
" 0.7143 | \n",
" 0.8007 | \n",
" 0.8285 | \n",
" 0.8547 | \n",
" 0.8889 | \n",
" 0.9154 | \n",
"
\n",
" \n",
" GaussianNB | \n",
" 0.7220 | \n",
" 0.7981 | \n",
" 0.8180 | \n",
" 0.8611 | \n",
" 0.8657 | \n",
" 0.8865 | \n",
"
\n",
" \n",
" LogisticRegression | \n",
" 0.7347 | \n",
" 0.7941 | \n",
" 0.8319 | \n",
" 0.8746 | \n",
" 0.8881 | \n",
" 0.8917 | \n",
"
\n",
" \n",
" Voting | \n",
" 0.7313 | \n",
" 0.8000 | \n",
" 0.8295 | \n",
" 0.8709 | \n",
" 0.8977 | \n",
" 0.9139 | \n",
"
\n",
" \n",
" MultiCons | \n",
" 0.7322 | \n",
" 0.7909 | \n",
" 0.8262 | \n",
" 0.8741 | \n",
" 0.8951 | \n",
" 0.9033 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 25 53 88 123 165 207\n",
"KNeighborsClassifier 0.7063 0.7758 0.8325 0.8800 0.8969 0.9097\n",
"SVC 0.7219 0.7968 0.8271 0.8767 0.8935 0.9113\n",
"MLPClassifier 0.7374 0.7927 0.8358 0.8711 0.8838 0.9014\n",
"DecisionTreeClassifier 0.7143 0.8007 0.8285 0.8547 0.8889 0.9154\n",
"GaussianNB 0.7220 0.7981 0.8180 0.8611 0.8657 0.8865\n",
"LogisticRegression 0.7347 0.7941 0.8319 0.8746 0.8881 0.8917\n",
"Voting 0.7313 0.8000 0.8295 0.8709 0.8977 0.9139\n",
"MultiCons 0.7322 0.7909 0.8262 0.8741 0.8951 0.9033"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%capture -ns predicting_students_final_exam_outcome scores\n",
"def get_train_test_assessments_by_day(day):\n",
" \"\"\"Returns the train/test feature table filtered by date.\"\"\"\n",
"\n",
" filtered_feature_table = pd.concat(\n",
" [get_feature_table(day), get_feature_table(day, code_presentation=\"2014B\")]\n",
" )\n",
" filtered_feature_table = fill_nas(filtered_feature_table)\n",
" return normalized_train_test_split(filtered_feature_table)\n",
"\n",
"\n",
"def get_scores_by_assessment_date():\n",
" \"\"\"Returns a DataFrame with f1 prediction scores for each classifier.\"\"\"\n",
" # pylint: disable=too-many-locals\n",
" result = {}\n",
" # We select the date such that both courses include the same amount of assessments\n",
" # after the filter.\n",
" for day in [25, 53, 88, 123, 165, 207]:\n",
" result[day] = []\n",
" x_train_, x_test_, y_train_, y_test_ = get_train_test_assessments_by_day(day)\n",
" train_predictions = []\n",
" predictions = []\n",
" estimators = []\n",
" for classifier, hyperparameters in classifier_hyperparameters.items():\n",
" gs_classifier = GridSearchCV(\n",
" classifier(), hyperparameters, scoring=\"f1\", n_jobs=-1\n",
" )\n",
" gs_classifier.fit(x_train_, y_train_)\n",
" estimators.append((classifier.__name__, gs_classifier))\n",
" predictions.append(gs_classifier.predict(x_test_))\n",
" train_predictions.append(gs_classifier.predict(x_train_))\n",
" result[day].append(round(f1_score(y_test_, predictions[-1]), 4))\n",
"\n",
" # Voting Classifier\n",
" voting = VotingClassifier(estimators=estimators, voting=\"soft\")\n",
" voting.fit(x_train_, y_train_)\n",
" result[day].append(round(f1_score(y_test_, voting.predict(x_test_)), 4))\n",
"\n",
" # Searching for the best merging_threshold.\n",
" max_score = 0\n",
" multicons = None\n",
" for merging_threshold in np.arange(0, 1, 0.05):\n",
" consensus = MultiCons(\n",
" similarity_measure=\"JaccardIndex\",\n",
" optimize_label_names=True,\n",
" consensus_function=\"consensus_function_12\",\n",
" merging_threshold=merging_threshold,\n",
" ).fit(train_predictions)\n",
" score = f1_score(y_train_, consensus.labels_.astype(bool))\n",
" if score > max_score:\n",
" max_score = score\n",
" multicons = consensus\n",
"\n",
" result[day].append(\n",
" round(f1_score(y_test_, multicons.fit(predictions).labels_.astype(bool)), 4)\n",
" )\n",
"\n",
" return pd.DataFrame(\n",
" result,\n",
" index=[clf.__name__ for clf in classifier_hyperparameters]\n",
" + [\"Voting\", \"MultiCons\"],\n",
" )\n",
"\n",
"\n",
"scores = get_scores_by_assessment_date()\n",
"display(Markdown(\"F1 score at different points in time:\"))\n",
"display(scores)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}