Predicting students’ final exam outcome#

This section aims to predict each student’s final exam outcome (Pass: score >= 40; Fail: score < 40). We try to replicate the machine learning analysis techniques from the work of Tomasevic et al. (2020) [TGV20].

Keywords: Predicting student outcome

[TGV20]

Nikola Tomasevic, Nikola Gvozdenovic, and Sanja Vranes. An overview and comparison of supervised data mining techniques for student exam performance prediction. Computers & Education, 143:103676, 2020. URL: https://www.sciencedirect.com/science/article/pii/S0360131519302295, doi:10.1016/j.compedu.2019.103676.

import numpy as np
import pandas as pd
from IPython.display import Markdown, display
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from multicons import MultiCons

from oulad import filter_by_module_presentation, get_oulad

%load_ext oulad.capture
%%capture oulad
oulad = get_oulad()

Preparing train/test data#

Selecting features#

In the work of Tomasevic et al., the student data from the DDD module is used, combining the 2013J and 2014B presentations.

Similarly, we select the same seven distinct attributes, drawn from the three distinct types shown in the table below:

| DEMOGRAPHIC | ENGAGEMENT | PERFORMANCE |
| --- | --- | --- |
| gender | sum of clicks per assessment | scores per assessment |
| highest_education | number of attempts | final_exam score |
| age_band | | |

%%capture -ns predicting_students_final_exam_outcome feature_table


def get_feature_table(max_date=500, code_presentation="2013J"):
    """Returns the feature table computed from the OULAD dataset."""
    assessments = (
        filter_by_module_presentation(oulad.assessments, "DDD", code_presentation)
        # Filter out assessments that are after the max_date.
        .query(f"date <= {max_date} or assessment_type == 'Exam'").set_index(
            "id_assessment"
        )
    )
    vle = (
        filter_by_module_presentation(oulad.student_vle, "DDD", code_presentation)
        .loc[:, ["id_student", "date", "sum_click"]]
        # Categorize the date field by assessment date.
        .assign(
            date=lambda df: pd.cut(
                df.date,
                [-26] + assessments.date.values.tolist(),
                labels=assessments.date.values,
            )
        )
        # Sum clicks by date.
        .groupby(["id_student", "date"], observed=True)
        .agg("sum")
        .reset_index()
        # Reshape the vle table.
        .pivot(index="id_student", columns="date", values="sum_click")
        # Rename columns
        .rename(
            columns={
                assessment.date: (
                    f"assessment_{i+1}_sum_click"
                    if assessment.assessment_type != "Exam"
                    else "final_exam_sum_click"
                )
                for i, (_, assessment) in enumerate(assessments.iterrows())
            }
        )
        .drop("final_exam_sum_click", axis=1)
    )
    return (
        filter_by_module_presentation(oulad.student_info, "DDD", code_presentation)
        .loc[
            :,
            [
                "age_band",
                "gender",
                "id_student",
                "highest_education",
                "num_of_prev_attempts",
                "final_result",
            ],
        ]
        # Transform gender, age_band and highest_education to numeric values.
        .replace(
            {
                "age_band": {"0-35": "0.0", "35-55": "0.5", "55<=": "1.0"},
                "gender": {"M": "0.0", "F": "1.0"},
                "highest_education": {
                    "No Formal quals": "0.0",
                    "Lower Than A Level": "0.25",
                    "A Level or Equivalent": "0.5",
                    "HE Qualification": "0.75",
                    "Post Graduate Qualification": "1.0",
                },
            }
        )
        .astype(
            {
                "age_band": float,
                "gender": float,
                "highest_education": float,
                "num_of_prev_attempts": float,
            }
        )
        .set_index("id_student")
        # Filter out students who have unregistered from the course before the start.
        .join(
            filter_by_module_presentation(
                oulad.student_registration, "DDD", code_presentation
            )
            .set_index("id_student")
            .query("not date_unregistration < 0")
            .loc[:, []],
            how="right",
        )
        .join(vle)
        .join(
            assessments.join(oulad.student_assessment.set_index("id_assessment"))
            .reset_index()
            .pivot(index="id_student", columns="id_assessment", values="score")
            .rename(
                columns={
                    id_assessment: (
                        f"assessment_{i+1}_score"
                        if assessment.assessment_type != "Exam"
                        else "final_exam_score"
                    )
                    for i, (id_assessment, assessment) in enumerate(
                        assessments.iterrows()
                    )
                }
            )
        )
    )


feature_table = pd.concat(
    [get_feature_table(), get_feature_table(code_presentation="2014B")]
)
display(feature_table)
| id_student | age_band | gender | highest_education | num_of_prev_attempts | final_result | assessment_1_sum_click | assessment_2_sum_click | assessment_3_sum_click | assessment_4_sum_click | assessment_5_sum_click | assessment_6_sum_click | assessment_1_score | assessment_2_score | assessment_3_score | assessment_4_score | assessment_5_score | assessment_6_score | final_exam_score |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 8462 | 1.0 | 0.0 | 0.75 | 0.0 | Withdrawn | 340.0 | 176.0 | 90.0 | 40.0 | NaN | NaN | 93.0 | 83.0 | 87.0 | NaN | NaN | NaN | NaN |
| 27417 | 0.0 | 0.0 | 0.25 | 0.0 | Withdrawn | 43.0 | 180.0 | 103.0 | 23.0 | 52.0 | 9.0 | 48.0 | 58.0 | 52.0 | 6.0 | NaN | NaN | NaN |
| 27935 | 0.0 | 1.0 | 0.25 | 0.0 | Fail | 301.0 | 286.0 | 61.0 | 118.0 | 47.0 | NaN | 75.0 | 73.0 | 90.0 | 63.0 | NaN | NaN | NaN |
| 28046 | 0.5 | 1.0 | 0.75 | 0.0 | Fail | 127.0 | 131.0 | 20.0 | 45.0 | 202.0 | 53.0 | 58.0 | 57.0 | 49.0 | 49.0 | 69.0 | 24.0 | 40.0 |
| 29411 | 0.0 | 0.0 | 0.50 | 0.0 | Pass | 134.0 | 78.0 | 40.0 | 59.0 | 246.0 | 57.0 | 75.0 | 79.0 | 93.0 | 58.0 | 86.0 | 66.0 | 62.0 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 2692948 | 0.5 | 1.0 | 0.25 | 1.0 | Pass | 2.0 | 28.0 | 107.0 | 59.0 | 47.0 | 113.0 | NaN | NaN | 73.0 | 50.0 | 53.0 | 41.0 | 53.0 |
| 2694886 | 0.0 | 0.0 | 0.75 | 0.0 | Pass | 210.0 | 125.0 | 43.0 | 82.0 | 224.0 | 45.0 | 71.0 | 76.0 | 85.0 | 66.0 | 64.0 | 62.0 | 69.0 |
| 2696376 | 0.0 | 1.0 | 0.25 | 3.0 | Fail | NaN | 40.0 | 87.0 | 1.0 | 53.0 | 92.0 | NaN | 56.0 | 39.0 | NaN | 27.0 | 26.0 | NaN |
| 2698251 | 0.0 | 1.0 | 0.50 | 0.0 | Fail | 447.0 | 158.0 | 59.0 | 257.0 | 360.0 | 115.0 | 69.0 | 76.0 | 53.0 | 67.0 | 62.0 | 36.0 | 44.0 |
| 2710343 | 0.0 | 0.0 | 0.25 | 1.0 | Fail | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |

2875 rows × 18 columns

Pre-Processing#

Handling NAs#

We notice many missing values in the final_exam_score column of the selected feature table.

print(
    f"The feature table has {len(feature_table)} rows and the final exam score "
    f"column has {feature_table.final_exam_score.isna().sum()} rows with NAs "
    f"({100*feature_table.final_exam_score.isna().sum() / len(feature_table):.0f}%)."
)
The feature table has 2875 rows and the final exam score column has 1383 rows with NAs (48%).

This is explained in the original OULAD paper by Kuzilek et al. [KHZ17]:

Results of the final exam are usually missing (since they are scored and used for the
final marking immediately at the end of the module).

Therefore, we use the final_result column to fill in the missing final exam values and then remove the final_result column.

The remaining missing values in the other columns are filled with the value -1.

def fill_nas(feature_table_df):
    """Fills NAs in the `final_exam_score` column with `final_result` values,
    drops the `final_result` column and fills remaining NAs with the value `-1`.
    """

    mask = feature_table_df.final_exam_score.isna()
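    # `Pass`/`Distinction` final results map to a minimal passing exam score
    # of 40; all other final results map to 0.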
    feature_table_df.loc[mask, "final_exam_score"] = (
        feature_table_df[mask].final_result.isin(["Pass", "Distinction"]) * 40
    )
    return feature_table_df.drop(columns="final_result").fillna(-1)


feature_table = fill_nas(feature_table)
display(feature_table)
| id_student | age_band | gender | highest_education | num_of_prev_attempts | assessment_1_sum_click | assessment_2_sum_click | assessment_3_sum_click | assessment_4_sum_click | assessment_5_sum_click | assessment_6_sum_click | assessment_1_score | assessment_2_score | assessment_3_score | assessment_4_score | assessment_5_score | assessment_6_score | final_exam_score |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 8462 | 1.0 | 0.0 | 0.75 | 0.0 | 340.0 | 176.0 | 90.0 | 40.0 | -1.0 | -1.0 | 93.0 | 83.0 | 87.0 | -1.0 | -1.0 | -1.0 | 0.0 |
| 27417 | 0.0 | 0.0 | 0.25 | 0.0 | 43.0 | 180.0 | 103.0 | 23.0 | 52.0 | 9.0 | 48.0 | 58.0 | 52.0 | 6.0 | -1.0 | -1.0 | 0.0 |
| 27935 | 0.0 | 1.0 | 0.25 | 0.0 | 301.0 | 286.0 | 61.0 | 118.0 | 47.0 | -1.0 | 75.0 | 73.0 | 90.0 | 63.0 | -1.0 | -1.0 | 0.0 |
| 28046 | 0.5 | 1.0 | 0.75 | 0.0 | 127.0 | 131.0 | 20.0 | 45.0 | 202.0 | 53.0 | 58.0 | 57.0 | 49.0 | 49.0 | 69.0 | 24.0 | 40.0 |
| 29411 | 0.0 | 0.0 | 0.50 | 0.0 | 134.0 | 78.0 | 40.0 | 59.0 | 246.0 | 57.0 | 75.0 | 79.0 | 93.0 | 58.0 | 86.0 | 66.0 | 62.0 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 2692948 | 0.5 | 1.0 | 0.25 | 1.0 | 2.0 | 28.0 | 107.0 | 59.0 | 47.0 | 113.0 | -1.0 | -1.0 | 73.0 | 50.0 | 53.0 | 41.0 | 53.0 |
| 2694886 | 0.0 | 0.0 | 0.75 | 0.0 | 210.0 | 125.0 | 43.0 | 82.0 | 224.0 | 45.0 | 71.0 | 76.0 | 85.0 | 66.0 | 64.0 | 62.0 | 69.0 |
| 2696376 | 0.0 | 1.0 | 0.25 | 3.0 | -1.0 | 40.0 | 87.0 | 1.0 | 53.0 | 92.0 | -1.0 | 56.0 | 39.0 | -1.0 | 27.0 | 26.0 | 0.0 |
| 2698251 | 0.0 | 1.0 | 0.50 | 0.0 | 447.0 | 158.0 | 59.0 | 257.0 | 360.0 | 115.0 | 69.0 | 76.0 | 53.0 | 67.0 | 62.0 | 36.0 | 44.0 |
| 2710343 | 0.0 | 0.0 | 0.25 | 1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.0 |

2875 rows × 17 columns

Splitting train/test data and Normalization#

Now we randomly split the feature table rows into a train (80%) and a test (20%) table and, as in the work of Tomasevic et al., scale and normalize the selected features:

| Feature | Normalization |
| --- | --- |
| Gender | 0 = male, 1 = female |
| Age band | 0.0 = 0-35, 0.5 = 35-55, 1.0 = 55<= |
| Highest education | 0.00 = No Formal quals, 0.25 = Lower Than A Level, 0.50 = A Level or Equivalent, 0.75 = HE Qualification, 1.00 = Post Graduate Qualification |
| Number of attempts | 0-N scaled to [0-1] |
| Sum of clicks per assessment | 0-N scaled to [0-1] |
| Scores per assessment | 0-100 scaled to [0-1] |
| Final exam score | 0-100 scaled to [0-1] |
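The “scaled to [0-1]” rows correspond to min-max scaling. As a quick illustration of what MinMaxScaler does, here is a minimal sketch on a small hypothetical column (not OULAD data):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling maps the observed [min, max] range linearly onto [0, 1].
clicks_demo = np.array([[0.0], [40.0], [200.0]])
print(MinMaxScaler().fit_transform(clicks_demo).ravel())  # [0.  0.2 1. ]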

RANDOM_STATE = 0


def normalized_train_test_split(feature_table_df):
    """Returns the normalized train/test split computed from the feature table."""
    x_train_, x_test_, y_train_, y_test_ = train_test_split(
        feature_table_df.drop(columns="final_exam_score"),
        feature_table_df["final_exam_score"],
        test_size=0.2,
        random_state=RANDOM_STATE,
    )
    # Scale scores per assessment and final_exam_score.
    assessment_score_labels = feature_table_df.columns.values[
        feature_table_df.columns.str.match(r"assessment_[0-9]+_score")
    ]
    x_train_.loc[:, assessment_score_labels] /= 100
    x_test_.loc[:, assessment_score_labels] /= 100
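    # Binarize the target: 1 = pass (final exam score >= 40), 0 = fail.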
    y_train_ = (y_train_ / 100 >= 0.4).astype(int)
    y_test_ = (y_test_ / 100 >= 0.4).astype(int)

    # Scale the sum of clicks per assessment and number of attempts.
    columns_slice = feature_table_df.columns.values[
        feature_table_df.columns.str.match(r"assessment_[0-9]+_sum_click")
    ].tolist() + ["num_of_prev_attempts"]

    # Note: we fit the scaler only on the train data to avoid leaking information
    # from the test data.
    scaler = MinMaxScaler().fit(x_train_.loc[:, columns_slice])
    x_train_.loc[:, columns_slice] = scaler.transform(x_train_.loc[:, columns_slice])
    x_test_.loc[:, columns_slice] = scaler.transform(x_test_.loc[:, columns_slice])
    return (x_train_, x_test_, y_train_, y_test_)


x_train, x_test, y_train, y_test = normalized_train_test_split(feature_table)
display(x_train)
| id_student | age_band | gender | highest_education | num_of_prev_attempts | assessment_1_sum_click | assessment_2_sum_click | assessment_3_sum_click | assessment_4_sum_click | assessment_5_sum_click | assessment_6_sum_click | assessment_1_score | assessment_2_score | assessment_3_score | assessment_4_score | assessment_5_score | assessment_6_score |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 628806 | 0.0 | 0.0 | 0.50 | 0.000000 | 0.297634 | 0.171275 | 0.179245 | 0.442890 | 0.105741 | 0.026979 | 0.95 | 0.95 | 0.98 | 0.97 | -0.01 | -0.01 |
| 572976 | 0.0 | 1.0 | 0.50 | 0.000000 | 0.036129 | 0.012887 | 0.052411 | 0.062937 | 0.038154 | 0.008165 | 0.68 | 0.43 | 0.45 | 0.56 | 0.64 | -0.01 |
| 588566 | 0.5 | 0.0 | 0.50 | 0.000000 | 0.059355 | 0.043580 | 0.188679 | 0.163947 | 0.148619 | 0.035144 | 0.75 | 0.90 | 0.83 | 0.80 | 0.66 | 0.62 |
| 607555 | 0.0 | 0.0 | 0.75 | 0.000000 | 0.038280 | 0.017338 | 0.040881 | 0.076923 | 0.062863 | 0.084487 | 0.70 | 0.54 | 0.73 | 0.37 | 0.15 | 0.41 |
| 381539 | 0.0 | 0.0 | 0.00 | 0.166667 | 0.038710 | 0.005389 | 0.018868 | 0.062937 | 0.000000 | 0.000000 | 0.79 | 0.43 | 0.66 | -0.01 | -0.01 | -0.01 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 552313 | 0.0 | 1.0 | 0.25 | 0.000000 | 0.076989 | 0.037254 | 0.104822 | 0.177156 | 0.134448 | 0.051828 | 0.87 | 0.78 | 0.93 | 0.87 | 0.88 | 0.79 |
| 563770 | 0.0 | 0.0 | 0.25 | 0.000000 | 0.054624 | 0.036082 | 0.047170 | 0.003885 | 0.010538 | 0.000710 | 0.98 | 0.91 | 0.87 | -0.01 | 0.63 | -0.01 |
| 2377769 | 0.0 | 1.0 | 0.50 | 0.000000 | 0.237849 | 0.036082 | 0.161426 | 0.165501 | 0.080669 | 0.055023 | 0.65 | 0.84 | 0.75 | 0.42 | 0.66 | 0.61 |
| 629822 | 0.0 | 0.0 | 0.25 | 0.000000 | 0.080430 | 0.014292 | 0.019916 | 0.013209 | 0.020349 | 0.009585 | 0.90 | 0.74 | 0.79 | 0.22 | 0.63 | 0.32 |
| 1332356 | 0.5 | 0.0 | 0.75 | 0.000000 | 0.099785 | 0.015230 | 0.082809 | 0.037296 | 0.009811 | 0.003195 | 0.75 | 0.87 | 0.90 | 0.76 | -0.01 | 0.67 |

2300 rows × 16 columns

Classification#

As in the work of Tomasevic et al., we will compare the classification performances for the student final exam pass prediction (score >= 40).

We use the same models and perform a grid search over the same hyper-parameter ranges where these were specified in the paper:

  • K-Nearest Neighbours (with & without weights, varying K between 1 and 50)

  • Support Vector Machines (with linear and RBF kernels, varying C in [0.1, 1.0, 10], varying gamma in [0.0001, 0.01, 0.1])

  • Artificial Neural Networks (with one and two hidden layers)

  • Decision Trees (with varying max depth, split strategy and quality measure)

  • Naïve Bayes (with varying var_smoothing)

  • Logistic Regression (with lbfgs and saga solvers)

The performance metric used here is also the F1 score. As a reminder, the formula of the F1 score is:

F1 = 2 * (precision * recall) / (precision + recall)
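As a quick sanity check, the formula can be verified against scikit-learn’s implementation on a small, purely hypothetical prediction vector:

from sklearn.metrics import f1_score, precision_score, recall_score

# Hypothetical labels, for illustration only (not OULAD data).
y_true_demo = [1, 0, 1, 1, 0, 1]
y_pred_demo = [1, 0, 0, 1, 1, 1]

precision = precision_score(y_true_demo, y_pred_demo)  # 3 / 4
recall = recall_score(y_true_demo, y_pred_demo)  # 3 / 4
print(2 * (precision * recall) / (precision + recall))  # 0.75
print(f1_score(y_true_demo, y_pred_demo))  # 0.75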

However, in contrast to the paper, we use 5-fold cross validation during the grid search phase.

%%capture -ns predicting_students_final_exam_outcome gs_scores
# Hyperparameter search space

classifier_hyperparameters = {
    # K-Nearest Neighbours
    KNeighborsClassifier: [
        # {"n_neighbors": range(1, 51), "weights":["uniform", "distance"]}
        # We reduce the search space for speed.
        {
            "n_neighbors": [24],
            "weights": ["distance"],
        }
    ],
    # Support Vector Machines
    SVC: [
        # {
        #     "kernel": ["linear"],
        #     "C": [0.1, 1.0, 10],
        #     "probability": [True],
        #     "random_state": [RANDOM_STATE],
        # },
        {
            "kernel": ["rbf"],
            "C": [10],  # [0.1, 1.0, 10],
            "gamma": ["scale"],  # ["scale", "auto", 0.0001, 0.01, 0.1],
            "probability": [True],
            "random_state": [RANDOM_STATE],
        },
    ],
    # Artificial Neural Networks
    MLPClassifier: [
        {
            "max_iter": [1000],
            "validation_fraction": [0.2],
            "hidden_layer_sizes": [(10,)],  # [(10,), (20,), (52, 10)],
            "random_state": [RANDOM_STATE],
            # [(i,) for i in range(2, 100, 10)] + [
            #     (i, j) for i in range(2, 100, 10) for j in range(2, 100, 10)
            # ],
            # As we do not notice any improvement by varying `activation` and `alpha`,
            # we choose to keep the default values for these parameters.
            # "activation": ["logistic", "tanh", "relu"],
            # "alpha": 10.0 ** (- np.arange(-1,6))
        },
    ],
    # Decision Tree
    DecisionTreeClassifier: [
        {
            "criterion": ["entropy"],  # ["gini", "entropy"],
            "splitter": ["best"],  # ["best", "random"],
            "max_depth": [6],  # [None, *list(range(1, 11))],
            "min_samples_split": [2],  # range(2, 11, 2),
            "min_samples_leaf": [10],  # range(2, 11, 2),
            "random_state": [RANDOM_STATE],
        },
    ],
    # Naive Bayes
    GaussianNB: [
        {
            "var_smoothing": [1e-9],  # [1e-9, 1e-8, 1e-7, 1e-6]
        }
    ],
    # Logistic Regression
    LogisticRegression: [
        {
            "solver": ["lbfgs"],  # ["lbfgs", "saga"],
            "random_state": [RANDOM_STATE],
        }
    ],
}


def get_grid_search_scores():
    """Returns the grid search scores."""
    classifier_score = {"classifier": [], "score": []}
    for classifier, hyperparameters in classifier_hyperparameters.items():
        gs_classifier = GridSearchCV(
            classifier(), hyperparameters, scoring="f1", n_jobs=-1
        )
        gs_classifier.fit(x_train, y_train)
        classifier_score["classifier"].append(classifier.__name__)
        classifier_score["score"].append(gs_classifier.score(x_test, y_test))

    return classifier_score


gs_scores = pd.DataFrame(get_grid_search_scores()).round(4)
display(gs_scores)
| | classifier | score |
| --- | --- | --- |
| 0 | KNeighborsClassifier | 0.9097 |
| 1 | SVC | 0.9113 |
| 2 | MLPClassifier | 0.9014 |
| 3 | DecisionTreeClassifier | 0.9154 |
| 4 | GaussianNB | 0.8865 |
| 5 | LogisticRegression | 0.8917 |

Classification at different points in time#

Predicting the student final exam outcome seems more valuable at an early stage of the course, as it might give instructors more time to help students at risk. However, predicting early is more challenging, as less data is available to the classifiers.

As in the work of Tomasevic et al., we will compare the classification performances at different moments of the course based on the number of assessments passed.

Let’s start by taking a look at the assessment table for the selected courses.

oulad.assessments[
    (oulad.assessments.code_module == "DDD")
    & (oulad.assessments.assessment_type == "TMA")
    & (
        (oulad.assessments.code_presentation == "2013J")
        | (oulad.assessments.code_presentation == "2014B")
    )
].sort_values("date")
| | code_module | code_presentation | id_assessment | assessment_type | date | weight |
| --- | --- | --- | --- | --- | --- | --- |
| 88 | DDD | 2013J | 25348 | TMA | 25.0 | 10.0 |
| 95 | DDD | 2014B | 25355 | TMA | 25.0 | 10.0 |
| 96 | DDD | 2014B | 25356 | TMA | 53.0 | 12.5 |
| 89 | DDD | 2013J | 25349 | TMA | 53.0 | 12.5 |
| 97 | DDD | 2014B | 25357 | TMA | 74.0 | 17.5 |
| 90 | DDD | 2013J | 25350 | TMA | 88.0 | 17.5 |
| 98 | DDD | 2014B | 25358 | TMA | 116.0 | 20.0 |
| 91 | DDD | 2013J | 25351 | TMA | 123.0 | 20.0 |
| 99 | DDD | 2014B | 25359 | TMA | 158.0 | 20.0 |
| 92 | DDD | 2013J | 25352 | TMA | 165.0 | 20.0 |
| 100 | DDD | 2014B | 25360 | TMA | 200.0 | 20.0 |
| 93 | DDD | 2013J | 25353 | TMA | 207.0 | 20.0 |

We note that each course presentation has six intermediate assessments.

Next, we use the final submission date field to filter out assessment-related information after a given date and repeat the same data preprocessing and classification process as done previously.

We also add the Voting and MultiCons ensemble methods to check whether they might improve on the previous results.
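For reference, with voting="soft" the VotingClassifier averages the class probabilities predicted by each estimator and picks the class with the highest mean probability. Below is a minimal sketch of that averaging idea on hypothetical probabilities, not the actual scikit-learn internals:

import numpy as np

# Hypothetical positive-class probabilities from three models for three students.
probas = np.array(
    [
        [0.9, 0.2, 0.6],  # model 1
        [0.8, 0.4, 0.3],  # model 2
        [0.7, 0.3, 0.4],  # model 3
    ]
)
# Soft voting: average the probabilities across models, then threshold at 0.5.
print((probas.mean(axis=0) >= 0.5).astype(int))  # [1 0 0]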

%%capture -ns predicting_students_final_exam_outcome scores
def get_train_test_assessments_by_day(day):
    """Returns the train/test feature table filtered by date."""

    filtered_feature_table = pd.concat(
        [get_feature_table(day), get_feature_table(day, code_presentation="2014B")]
    )
    filtered_feature_table = fill_nas(filtered_feature_table)
    return normalized_train_test_split(filtered_feature_table)


def get_scores_by_assessment_date():
    """Returns a DataFrame with f1 prediction scores for each classifier."""
    # pylint: disable=too-many-locals
    result = {}
    # We select the dates so that both presentations include the same number of
    # assessments after the filter.
    for day in [25, 53, 88, 123, 165, 207]:
        result[day] = []
        x_train_, x_test_, y_train_, y_test_ = get_train_test_assessments_by_day(day)
        train_predictions = []
        predictions = []
        estimators = []
        for classifier, hyperparameters in classifier_hyperparameters.items():
            gs_classifier = GridSearchCV(
                classifier(), hyperparameters, scoring="f1", n_jobs=-1
            )
            gs_classifier.fit(x_train_, y_train_)
            estimators.append((classifier.__name__, gs_classifier))
            predictions.append(gs_classifier.predict(x_test_))
            train_predictions.append(gs_classifier.predict(x_train_))
            result[day].append(round(f1_score(y_test_, predictions[-1]), 4))

        # Voting Classifier
        voting = VotingClassifier(estimators=estimators, voting="soft")
        voting.fit(x_train_, y_train_)
        result[day].append(round(f1_score(y_test_, voting.predict(x_test_)), 4))

        # Searching for the best merging_threshold.
        max_score = 0
        multicons = None
        for merging_threshold in np.arange(0, 1, 0.05):
            consensus = MultiCons(
                similarity_measure="JaccardIndex",
                optimize_label_names=True,
                consensus_function="consensus_function_12",
                merging_threshold=merging_threshold,
            ).fit(train_predictions)
            score = f1_score(y_train_, consensus.labels_.astype(bool))
            if score > max_score:
                max_score = score
                multicons = consensus

        result[day].append(
            round(f1_score(y_test_, multicons.fit(predictions).labels_.astype(bool)), 4)
        )

    return pd.DataFrame(
        result,
        index=[clf.__name__ for clf in classifier_hyperparameters]
        + ["Voting", "MultiCons"],
    )


scores = get_scores_by_assessment_date()
display(Markdown("F1 score at different points in time:"))
display(scores)

F1 score at different points in time:

| | 25 | 53 | 88 | 123 | 165 | 207 |
| --- | --- | --- | --- | --- | --- | --- |
| KNeighborsClassifier | 0.7063 | 0.7758 | 0.8325 | 0.8800 | 0.8969 | 0.9097 |
| SVC | 0.7219 | 0.7968 | 0.8271 | 0.8767 | 0.8935 | 0.9113 |
| MLPClassifier | 0.7374 | 0.7927 | 0.8358 | 0.8711 | 0.8838 | 0.9014 |
| DecisionTreeClassifier | 0.7143 | 0.8007 | 0.8285 | 0.8547 | 0.8889 | 0.9154 |
| GaussianNB | 0.7220 | 0.7981 | 0.8180 | 0.8611 | 0.8657 | 0.8865 |
| LogisticRegression | 0.7347 | 0.7941 | 0.8319 | 0.8746 | 0.8881 | 0.8917 |
| Voting | 0.7313 | 0.8000 | 0.8295 | 0.8709 | 0.8977 | 0.9139 |
| MultiCons | 0.7322 | 0.7909 | 0.8262 | 0.8741 | 0.8951 | 0.9033 |
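
To make the trend easier to read, the scores table can also be plotted, for example with pandas’ built-in plotting. A minimal sketch, assuming matplotlib is available in the environment:

# Plot one line per classifier; the x-axis is the course day used for filtering.
ax = scores.T.plot(marker="o", figsize=(10, 6))
ax.set_xlabel("Course day")
ax.set_ylabel("F1 score")
ax.legend(loc="lower right")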