Predicting students’ final exam outcome#
This section aims to predict students’ final exam outcome
(Pass: score >= 40 / Fail: score < 40).
We try to replicate the machine learning analysis techniques from the work of
Tomasevic et al. (2020) [TGV20].
Keywords: Predicting student outcome
Nikola Tomasevic, Nikola Gvozdenovic, and Sanja Vranes. An overview and comparison of supervised data mining techniques for student exam performance prediction. Computers & Education, 143:103676, 2020. URL: https://www.sciencedirect.com/science/article/pii/S0360131519302295, doi:10.1016/j.compedu.2019.103676.
import numpy as np
import pandas as pd
from IPython.display import Markdown, display
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from multicons import MultiCons
from oulad import filter_by_module_presentation, get_oulad
%load_ext oulad.capture
%%capture oulad
oulad = get_oulad()
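Before building features, a quick look at the dimensions of the OULAD tables used below can help (a minimal sketch, assuming `get_oulad` exposes each table as a pandas DataFrame, as the following cells do):

```python
# Print the dimensions of each OULAD table used in this section.
for table in (
    "assessments",
    "student_assessment",
    "student_info",
    "student_registration",
    "student_vle",
):
    print(table, getattr(oulad, table).shape)
```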
Preparing train/test data#
Selecting features#
In the work of Tomasevic et al., the student data from the DDD module of the
2013J and 2014B presentations combined is used.
Similarly, we select the same seven attributes, drawn from the three attribute
types below:
DEMOGRAPHIC | ENGAGEMENT | PERFORMANCE |
---|---|---|
gender | sum of clicks per assessment | scores per assessment |
highest_education | number of attempts | final_exam score |
age_band | | |
%%capture -ns predicting_students_final_exam_outcome feature_table
def get_feature_table(max_date=500, code_presentation="2013J"):
"""Returns the feature table computed from the OULAD dataset."""
assessments = (
filter_by_module_presentation(oulad.assessments, "DDD", code_presentation)
# Filter out assessments that are after the max_date.
.query(f"date <= {max_date} or assessment_type == 'Exam'").set_index(
"id_assessment"
)
)
vle = (
filter_by_module_presentation(oulad.student_vle, "DDD", code_presentation)
.loc[:, ["id_student", "date", "sum_click"]]
# Categorize the date field by assessment date.
.assign(
date=lambda df: pd.cut(
df.date,
[-26] + assessments.date.values.tolist(),
labels=assessments.date.values,
)
)
# Sum scores by date.
.groupby(["id_student", "date"], observed=True)
.agg("sum")
.reset_index()
# Reshape the vle table.
.pivot(index="id_student", columns="date", values="sum_click")
# Rename columns
.rename(
columns={
assessment.date: (
f"assessment_{i+1}_sum_click"
if assessment.assessment_type != "Exam"
else "final_exam_sum_click"
)
for i, (_, assessment) in enumerate(assessments.iterrows())
}
)
.drop("final_exam_sum_click", axis=1)
)
return (
filter_by_module_presentation(oulad.student_info, "DDD", code_presentation)
.loc[
:,
[
"age_band",
"gender",
"id_student",
"highest_education",
"num_of_prev_attempts",
"final_result",
],
]
# Transform gender, age_band and highest_education to numeric values.
.replace(
{
"age_band": {"0-35": "0.0", "35-55": "0.5", "55<=": "1.0"},
"gender": {"M": "0.0", "F": "1.0"},
"highest_education": {
"No Formal quals": "0.0",
"Lower Than A Level": "0.25",
"A Level or Equivalent": "0.5",
"HE Qualification": "0.75",
"Post Graduate Qualification": "1.0",
},
}
)
.astype(
{
"age_band": float,
"gender": float,
"highest_education": float,
"num_of_prev_attempts": float,
}
)
.set_index("id_student")
# Filter out students who have unregistered from the course before the start.
.join(
filter_by_module_presentation(
oulad.student_registration, "DDD", code_presentation
)
.set_index("id_student")
.query("not date_unregistration < 0")
.loc[:, []],
how="right",
)
.join(vle)
.join(
assessments.join(oulad.student_assessment.set_index("id_assessment"))
.reset_index()
.pivot(index="id_student", columns="id_assessment", values="score")
.rename(
columns={
id_assessment: (
f"assessment_{i+1}_score"
if assessment.assessment_type != "Exam"
else "final_exam_score"
)
for i, (id_assessment, assessment) in enumerate(
assessments.iterrows()
)
}
)
)
)
feature_table = pd.concat(
[get_feature_table(), get_feature_table(code_presentation="2014B")]
)
display(feature_table)
age_band | gender | highest_education | num_of_prev_attempts | final_result | assessment_1_sum_click | assessment_2_sum_click | assessment_3_sum_click | assessment_4_sum_click | assessment_5_sum_click | assessment_6_sum_click | assessment_1_score | assessment_2_score | assessment_3_score | assessment_4_score | assessment_5_score | assessment_6_score | final_exam_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id_student | ||||||||||||||||||
8462 | 1.0 | 0.0 | 0.75 | 0.0 | Withdrawn | 340.0 | 176.0 | 90.0 | 40.0 | NaN | NaN | 93.0 | 83.0 | 87.0 | NaN | NaN | NaN | NaN |
27417 | 0.0 | 0.0 | 0.25 | 0.0 | Withdrawn | 43.0 | 180.0 | 103.0 | 23.0 | 52.0 | 9.0 | 48.0 | 58.0 | 52.0 | 6.0 | NaN | NaN | NaN |
27935 | 0.0 | 1.0 | 0.25 | 0.0 | Fail | 301.0 | 286.0 | 61.0 | 118.0 | 47.0 | NaN | 75.0 | 73.0 | 90.0 | 63.0 | NaN | NaN | NaN |
28046 | 0.5 | 1.0 | 0.75 | 0.0 | Fail | 127.0 | 131.0 | 20.0 | 45.0 | 202.0 | 53.0 | 58.0 | 57.0 | 49.0 | 49.0 | 69.0 | 24.0 | 40.0 |
29411 | 0.0 | 0.0 | 0.50 | 0.0 | Pass | 134.0 | 78.0 | 40.0 | 59.0 | 246.0 | 57.0 | 75.0 | 79.0 | 93.0 | 58.0 | 86.0 | 66.0 | 62.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2692948 | 0.5 | 1.0 | 0.25 | 1.0 | Pass | 2.0 | 28.0 | 107.0 | 59.0 | 47.0 | 113.0 | NaN | NaN | 73.0 | 50.0 | 53.0 | 41.0 | 53.0 |
2694886 | 0.0 | 0.0 | 0.75 | 0.0 | Pass | 210.0 | 125.0 | 43.0 | 82.0 | 224.0 | 45.0 | 71.0 | 76.0 | 85.0 | 66.0 | 64.0 | 62.0 | 69.0 |
2696376 | 0.0 | 1.0 | 0.25 | 3.0 | Fail | NaN | 40.0 | 87.0 | 1.0 | 53.0 | 92.0 | NaN | 56.0 | 39.0 | NaN | 27.0 | 26.0 | NaN |
2698251 | 0.0 | 1.0 | 0.50 | 0.0 | Fail | 447.0 | 158.0 | 59.0 | 257.0 | 360.0 | 115.0 | 69.0 | 76.0 | 53.0 | 67.0 | 62.0 | 36.0 | 44.0 |
2710343 | 0.0 | 0.0 | 0.25 | 1.0 | Fail | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2875 rows × 18 columns
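The least obvious step above is the `pd.cut` call: it buckets each VLE interaction day into the period ending at the next assessment deadline (the lower bound `-26` covers interactions logged before the module start, as OULAD dates may be negative). A minimal sketch with hypothetical deadlines to illustrate the binning:

```python
# Hypothetical assessment deadlines on days 25 and 53: every interaction day
# is mapped to the deadline that closes its period.
interactions = pd.Series([-5, 10, 30, 50])
print(pd.cut(interactions, bins=[-26, 25, 53], labels=[25, 53]))
# -5 and 10 fall into the period of deadline 25; 30 and 50 into that of 53.
```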
Pre-Processing#
Handling NAs#
We notice many missing values in the `final_exam_score` column of the selected
feature table.
print(
f"The feature table has {len(feature_table)} rows and the final exam score "
f"column has {feature_table.final_exam_score.isna().sum()} rows with NAs "
f"({100*feature_table.final_exam_score.isna().sum() / len(feature_table):.0f}%)."
)
The feature table has 2875 rows and the final exam score column has 1383 rows with NAs (48%).
This is explained in the original OULAD paper of Kuzilek et al. [KHZ17]:
Results of the final exam are usually missing (since they are scored and used for the
final marking immediately at the end of the module).
Therefore, we use the `final_result` column to fill in the missing final exam
values and then remove the `final_result` column.
Remaining missing values in other columns are filled with the value `-1`.
def fill_nas(feature_table_df):
"""Fills NAs in the `final_exam_score` column with `final_result` values,
drops the `final_result` column and fills remaining NAs with the value `-1`.
"""
mask = feature_table_df.final_exam_score.isna()
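    # "Pass" and "Distinction" final results map to the pass threshold (40),
    # all other final results map to 0.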
feature_table_df.loc[mask, "final_exam_score"] = (
feature_table_df[mask].final_result.isin(["Pass", "Distinction"]) * 40
)
return feature_table_df.drop(columns="final_result").fillna(-1)
feature_table = fill_nas(feature_table)
display(feature_table)
age_band | gender | highest_education | num_of_prev_attempts | assessment_1_sum_click | assessment_2_sum_click | assessment_3_sum_click | assessment_4_sum_click | assessment_5_sum_click | assessment_6_sum_click | assessment_1_score | assessment_2_score | assessment_3_score | assessment_4_score | assessment_5_score | assessment_6_score | final_exam_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id_student | |||||||||||||||||
8462 | 1.0 | 0.0 | 0.75 | 0.0 | 340.0 | 176.0 | 90.0 | 40.0 | -1.0 | -1.0 | 93.0 | 83.0 | 87.0 | -1.0 | -1.0 | -1.0 | 0.0 |
27417 | 0.0 | 0.0 | 0.25 | 0.0 | 43.0 | 180.0 | 103.0 | 23.0 | 52.0 | 9.0 | 48.0 | 58.0 | 52.0 | 6.0 | -1.0 | -1.0 | 0.0 |
27935 | 0.0 | 1.0 | 0.25 | 0.0 | 301.0 | 286.0 | 61.0 | 118.0 | 47.0 | -1.0 | 75.0 | 73.0 | 90.0 | 63.0 | -1.0 | -1.0 | 0.0 |
28046 | 0.5 | 1.0 | 0.75 | 0.0 | 127.0 | 131.0 | 20.0 | 45.0 | 202.0 | 53.0 | 58.0 | 57.0 | 49.0 | 49.0 | 69.0 | 24.0 | 40.0 |
29411 | 0.0 | 0.0 | 0.50 | 0.0 | 134.0 | 78.0 | 40.0 | 59.0 | 246.0 | 57.0 | 75.0 | 79.0 | 93.0 | 58.0 | 86.0 | 66.0 | 62.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2692948 | 0.5 | 1.0 | 0.25 | 1.0 | 2.0 | 28.0 | 107.0 | 59.0 | 47.0 | 113.0 | -1.0 | -1.0 | 73.0 | 50.0 | 53.0 | 41.0 | 53.0 |
2694886 | 0.0 | 0.0 | 0.75 | 0.0 | 210.0 | 125.0 | 43.0 | 82.0 | 224.0 | 45.0 | 71.0 | 76.0 | 85.0 | 66.0 | 64.0 | 62.0 | 69.0 |
2696376 | 0.0 | 1.0 | 0.25 | 3.0 | -1.0 | 40.0 | 87.0 | 1.0 | 53.0 | 92.0 | -1.0 | 56.0 | 39.0 | -1.0 | 27.0 | 26.0 | 0.0 |
2698251 | 0.0 | 1.0 | 0.50 | 0.0 | 447.0 | 158.0 | 59.0 | 257.0 | 360.0 | 115.0 | 69.0 | 76.0 | 53.0 | 67.0 | 62.0 | 36.0 | 44.0 |
2710343 | 0.0 | 0.0 | 0.25 | 1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.0 |
2875 rows × 17 columns
Splitting train/test data and Normalization#
Now we randomly split the feature table rows into a train (80%) and a test (20%) set and, as in the work of Tomasevic et al., we scale and normalize the selected features:
Feature | Normalization |
---|---|
Gender | 0 = male, 1 = female |
Age band | 0.0 = 0-35, 0.5 = 35-55, 1.0 = 55<= |
Highest education | 0.00 = No Formal quals, 0.25 = Lower Than A Level, 0.50 = A Level or Equivalent, 0.75 = HE Qualification, 1.00 = Post Graduate Qualification |
Number of attempts, Sum of clicks per assessment | 0-N scaled to [0-1] |
Scores per assessment, Final exam score | 0-100 scaled to [0-1] |
RANDOM_STATE = 0
def normalized_train_test_split(feature_table_df):
"""Returns the normalized train/test split computed from the feature table."""
x_train_, x_test_, y_train_, y_test_ = train_test_split(
feature_table_df.drop(columns="final_exam_score"),
feature_table_df["final_exam_score"],
test_size=0.2,
random_state=RANDOM_STATE,
)
# Scale scores per assessment and final_exam_score.
assessment_score_labels = feature_table_df.columns.values[
feature_table_df.columns.str.match(r"assessment_[0-9]+_score")
]
x_train_.loc[:, assessment_score_labels] /= 100
x_test_.loc[:, assessment_score_labels] /= 100
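    # Binarize the target: pass (score >= 40) -> 1, fail -> 0.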
y_train_ = (y_train_ / 100 >= 0.4).astype(int)
y_test_ = (y_test_ / 100 >= 0.4).astype(int)
# Scale the sum of clicks per assessment and number of attempts.
columns_slice = feature_table_df.columns.values[
feature_table_df.columns.str.match(r"assessment_[0-9]+_sum_click")
].tolist() + ["num_of_prev_attempts"]
# Note: we fit the scaler only on the train data to avoid leaking information
# from the test data.
scaler = MinMaxScaler().fit(x_train_.loc[:, columns_slice])
x_train_.loc[:, columns_slice] = scaler.transform(x_train_.loc[:, columns_slice])
x_test_.loc[:, columns_slice] = scaler.transform(x_test_.loc[:, columns_slice])
return (x_train_, x_test_, y_train_, y_test_)
x_train, x_test, y_train, y_test = normalized_train_test_split(feature_table)
display(x_train)
age_band | gender | highest_education | num_of_prev_attempts | assessment_1_sum_click | assessment_2_sum_click | assessment_3_sum_click | assessment_4_sum_click | assessment_5_sum_click | assessment_6_sum_click | assessment_1_score | assessment_2_score | assessment_3_score | assessment_4_score | assessment_5_score | assessment_6_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id_student | ||||||||||||||||
628806 | 0.0 | 0.0 | 0.50 | 0.000000 | 0.297634 | 0.171275 | 0.179245 | 0.442890 | 0.105741 | 0.026979 | 0.95 | 0.95 | 0.98 | 0.97 | -0.01 | -0.01 |
572976 | 0.0 | 1.0 | 0.50 | 0.000000 | 0.036129 | 0.012887 | 0.052411 | 0.062937 | 0.038154 | 0.008165 | 0.68 | 0.43 | 0.45 | 0.56 | 0.64 | -0.01 |
588566 | 0.5 | 0.0 | 0.50 | 0.000000 | 0.059355 | 0.043580 | 0.188679 | 0.163947 | 0.148619 | 0.035144 | 0.75 | 0.90 | 0.83 | 0.80 | 0.66 | 0.62 |
607555 | 0.0 | 0.0 | 0.75 | 0.000000 | 0.038280 | 0.017338 | 0.040881 | 0.076923 | 0.062863 | 0.084487 | 0.70 | 0.54 | 0.73 | 0.37 | 0.15 | 0.41 |
381539 | 0.0 | 0.0 | 0.00 | 0.166667 | 0.038710 | 0.005389 | 0.018868 | 0.062937 | 0.000000 | 0.000000 | 0.79 | 0.43 | 0.66 | -0.01 | -0.01 | -0.01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
552313 | 0.0 | 1.0 | 0.25 | 0.000000 | 0.076989 | 0.037254 | 0.104822 | 0.177156 | 0.134448 | 0.051828 | 0.87 | 0.78 | 0.93 | 0.87 | 0.88 | 0.79 |
563770 | 0.0 | 0.0 | 0.25 | 0.000000 | 0.054624 | 0.036082 | 0.047170 | 0.003885 | 0.010538 | 0.000710 | 0.98 | 0.91 | 0.87 | -0.01 | 0.63 | -0.01 |
2377769 | 0.0 | 1.0 | 0.50 | 0.000000 | 0.237849 | 0.036082 | 0.161426 | 0.165501 | 0.080669 | 0.055023 | 0.65 | 0.84 | 0.75 | 0.42 | 0.66 | 0.61 |
629822 | 0.0 | 0.0 | 0.25 | 0.000000 | 0.080430 | 0.014292 | 0.019916 | 0.013209 | 0.020349 | 0.009585 | 0.90 | 0.74 | 0.79 | 0.22 | 0.63 | 0.32 |
1332356 | 0.5 | 0.0 | 0.75 | 0.000000 | 0.099785 | 0.015230 | 0.082809 | 0.037296 | 0.009811 | 0.003195 | 0.75 | 0.87 | 0.90 | 0.76 | -0.01 | 0.67 |
2300 rows × 16 columns
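As a quick sanity check (not part of the original analysis), we can verify that the min-max scaled engagement features of the training set lie in [0, 1]. Note that the assessment score columns may contain `-0.01`: it is the `-1` NA filler divided by 100.

```python
# The scaler was fitted on the train set, so train values must lie in [0, 1].
click_columns = x_train.columns[x_train.columns.str.contains("sum_click")]
assert x_train[click_columns].min().min() >= 0
assert x_train[click_columns].max().max() <= 1
```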
Classification#
As in the work of Tomasevic et al., we will compare the classification performances for the student final exam pass prediction (score >= 40).
We use the same models and try to perform a grid search over the same hyper-parameter ranges where these were specified in the paper:
- K-Nearest Neighbours (with & without `weights`, varying `K` between 1 and 50)
- Support Vector Machines (with `linear` and `RBF` kernels, varying `C` in `[0.1, 1.0, 10]`, varying `gamma` in `[0.0001, 0.01, 0.1]`)
- Artificial Neural Networks (with one and two hidden layers)
- Decision Trees (with varying `max depth`, `split` strategy and `quality measure`)
- Naïve Bayes (with varying `var_smoothing`)
- Logistic Regression (with `lbfgs` and `saga` solvers)
The performance metric used here is also the F1 score.
As a reminder, the formula of the F1 score is:
`2 * (precision * recall) / (precision + recall)`
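As a toy check with made-up labels, we can confirm that scikit-learn's `f1_score` matches this formula:

```python
from sklearn.metrics import precision_score, recall_score

# 3 true positives, 1 false positive, 1 false negative.
y_true_toy = [1, 0, 1, 1, 0, 1]
y_pred_toy = [1, 0, 0, 1, 1, 1]
precision = precision_score(y_true_toy, y_pred_toy)  # 3 / 4
recall = recall_score(y_true_toy, y_pred_toy)  # 3 / 4
assert np.isclose(
    f1_score(y_true_toy, y_pred_toy),
    2 * (precision * recall) / (precision + recall),
)
```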
However, in contrast to the paper, we use 5-fold cross-validation during the grid search phase.
%%capture -ns predicting_students_final_exam_outcome gs_scores
# Hyperparameter search space
classifier_hyperparameters = {
# K-Nearest Neighbours
KNeighborsClassifier: [
# {"n_neighbors": range(1, 51), "weights":["uniform", "distance"]}
# We reduce search space for speed
{
"n_neighbors": [24],
"weights": ["distance"],
}
],
# Support Vector Machines
SVC: [
# {
# "kernel": ["linear"],
# "C": [0.1, 1.0, 10],
# "probability": [True],
# "random_state": [RANDOM_STATE],
# },
{
"kernel": ["rbf"],
"C": [10], # [0.1, 1.0, 10],
"gamma": ["scale"], # ["scale", "auto", 0.0001, 0.01, 0.1],
"probability": [True],
"random_state": [RANDOM_STATE],
},
],
# Artificial Neural Networks
MLPClassifier: [
{
"max_iter": [1000],
"validation_fraction": [0.2],
"hidden_layer_sizes": [(10,)], # [(10,), (20,), (52, 10)],
"random_state": [RANDOM_STATE],
# [(i,) for i in range(2, 100, 10)] + [
# (i, j) for i in range(2, 100, 10) for j in range(2, 100, 10)
# ],
# As we do not notice any improvement by varying `activation` and `alpha`,
# we choose to keep the default values for these parameters.
# "activation": ["logistic", "tanh", "relu"],
# "alpha": 10.0 ** (- np.arange(-1,6))
},
],
# Decision Tree
DecisionTreeClassifier: [
{
"criterion": ["entropy"], # ["gini", "entropy"],
"splitter": ["best"], # ["best", "random"],
"max_depth": [6], # [None, *list(range(1, 11))],
"min_samples_split": [2], # range(2, 11, 2),
"min_samples_leaf": [10], # range(2, 11, 2),
"random_state": [RANDOM_STATE],
},
],
# Naive Bayes
GaussianNB: [
{
"var_smoothing": [1e-9], # [1e-9, 1e-8, 1e-7, 1e-6]
}
],
# Logistic Regression
LogisticRegression: [
{
"solver": ["lbfgs"], # ["lbfgs", "saga"],
"random_state": [RANDOM_STATE],
}
],
}
def get_grid_search_scores():
"""Returns the grid search scores."""
classifier_score = {"classifier": [], "score": []}
for classifier, hyperparameters in classifier_hyperparameters.items():
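        # GridSearchCV performs 5-fold cross-validation by default (cv=None).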
gs_classifier = GridSearchCV(
classifier(), hyperparameters, scoring="f1", n_jobs=-1
)
gs_classifier.fit(x_train, y_train)
classifier_score["classifier"].append(classifier.__name__)
classifier_score["score"].append(gs_classifier.score(x_test, y_test))
return classifier_score
gs_scores = pd.DataFrame(get_grid_search_scores()).round(4)
display(gs_scores)
classifier | score | |
---|---|---|
0 | KNeighborsClassifier | 0.9097 |
1 | SVC | 0.9113 |
2 | MLPClassifier | 0.9014 |
3 | DecisionTreeClassifier | 0.9154 |
4 | GaussianNB | 0.8865 |
5 | LogisticRegression | 0.8917 |
Classification at different points in time#
Predicting the students’ final exam outcome seems more valuable at an early stage of the course, as it might give instructors more time to help the students at risk. However, predicting early is more challenging, as less data is available to the classifiers.
As in the work of Tomasevic et al., we will compare the classification performances at different moments of the course, based on the number of assessments completed.
Let’s start by taking a look at the assessment table for the selected courses.
oulad.assessments[
(oulad.assessments.code_module == "DDD")
& (oulad.assessments.assessment_type == "TMA")
& (
(oulad.assessments.code_presentation == "2013J")
| (oulad.assessments.code_presentation == "2014B")
)
].sort_values("date")
code_module | code_presentation | id_assessment | assessment_type | date | weight | |
---|---|---|---|---|---|---|
88 | DDD | 2013J | 25348 | TMA | 25.0 | 10.0 |
95 | DDD | 2014B | 25355 | TMA | 25.0 | 10.0 |
96 | DDD | 2014B | 25356 | TMA | 53.0 | 12.5 |
89 | DDD | 2013J | 25349 | TMA | 53.0 | 12.5 |
97 | DDD | 2014B | 25357 | TMA | 74.0 | 17.5 |
90 | DDD | 2013J | 25350 | TMA | 88.0 | 17.5 |
98 | DDD | 2014B | 25358 | TMA | 116.0 | 20.0 |
91 | DDD | 2013J | 25351 | TMA | 123.0 | 20.0 |
99 | DDD | 2014B | 25359 | TMA | 158.0 | 20.0 |
92 | DDD | 2013J | 25352 | TMA | 165.0 | 20.0 |
100 | DDD | 2014B | 25360 | TMA | 200.0 | 20.0 |
93 | DDD | 2013J | 25353 | TMA | 207.0 | 20.0 |
We note that each course presentation has six intermediate assessments (TMAs).
Next, we use the final submission date field to filter out assessment-related
information after a given date and repeat the same data preprocessing and
classification process as before.
We also add the Voting and MultiCons ensemble methods to check whether they improve on the previous results.
%%capture -ns predicting_students_final_exam_outcome scores
def get_train_test_assessments_by_day(day):
"""Returns the train/test feature table filtered by date."""
filtered_feature_table = pd.concat(
[get_feature_table(day), get_feature_table(day, code_presentation="2014B")]
)
filtered_feature_table = fill_nas(filtered_feature_table)
return normalized_train_test_split(filtered_feature_table)
def get_scores_by_assessment_date():
"""Returns a DataFrame with f1 prediction scores for each classifier."""
# pylint: disable=too-many-locals
result = {}
    # We select the dates such that both presentations include the same number
    # of assessments after the filter.
for day in [25, 53, 88, 123, 165, 207]:
result[day] = []
x_train_, x_test_, y_train_, y_test_ = get_train_test_assessments_by_day(day)
train_predictions = []
predictions = []
estimators = []
for classifier, hyperparameters in classifier_hyperparameters.items():
gs_classifier = GridSearchCV(
classifier(), hyperparameters, scoring="f1", n_jobs=-1
)
gs_classifier.fit(x_train_, y_train_)
estimators.append((classifier.__name__, gs_classifier))
predictions.append(gs_classifier.predict(x_test_))
train_predictions.append(gs_classifier.predict(x_train_))
result[day].append(round(f1_score(y_test_, predictions[-1]), 4))
# Voting Classifier
voting = VotingClassifier(estimators=estimators, voting="soft")
voting.fit(x_train_, y_train_)
result[day].append(round(f1_score(y_test_, voting.predict(x_test_)), 4))
# Searching for the best merging_threshold.
max_score = 0
multicons = None
for merging_threshold in np.arange(0, 1, 0.05):
consensus = MultiCons(
similarity_measure="JaccardIndex",
optimize_label_names=True,
consensus_function="consensus_function_12",
merging_threshold=merging_threshold,
).fit(train_predictions)
score = f1_score(y_train_, consensus.labels_.astype(bool))
if score > max_score:
max_score = score
multicons = consensus
result[day].append(
round(f1_score(y_test_, multicons.fit(predictions).labels_.astype(bool)), 4)
)
return pd.DataFrame(
result,
index=[clf.__name__ for clf in classifier_hyperparameters]
+ ["Voting", "MultiCons"],
)
scores = get_scores_by_assessment_date()
display(Markdown("F1 score at different points in time:"))
display(scores)
F1 score at different points in time:
25 | 53 | 88 | 123 | 165 | 207 | |
---|---|---|---|---|---|---|
KNeighborsClassifier | 0.7063 | 0.7758 | 0.8325 | 0.8800 | 0.8969 | 0.9097 |
SVC | 0.7219 | 0.7968 | 0.8271 | 0.8767 | 0.8935 | 0.9113 |
MLPClassifier | 0.7374 | 0.7927 | 0.8358 | 0.8711 | 0.8838 | 0.9014 |
DecisionTreeClassifier | 0.7143 | 0.8007 | 0.8285 | 0.8547 | 0.8889 | 0.9154 |
GaussianNB | 0.7220 | 0.7981 | 0.8180 | 0.8611 | 0.8657 | 0.8865 |
LogisticRegression | 0.7347 | 0.7941 | 0.8319 | 0.8746 | 0.8881 | 0.8917 |
Voting | 0.7313 | 0.8000 | 0.8295 | 0.8709 | 0.8977 | 0.9139 |
MultiCons | 0.7322 | 0.7909 | 0.8262 | 0.8741 | 0.8951 | 0.9033 |
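To make the trend easier to read, these scores can also be plotted over time; a minimal sketch, assuming `matplotlib` is available in the environment:

```python
import matplotlib.pyplot as plt

# Each line tracks one classifier's F1 score as the course progresses.
ax = scores.T.plot(marker="o", figsize=(8, 5))
ax.set_xlabel("Course day")
ax.set_ylabel("F1 score")
ax.set_title("F1 score at different points in time")
plt.show()
```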