In [20]:
# import the .py file which stores all our functions and library imports 
from ThematicTextClassify.TextClassifier import *
from ThematicTextClassify.Preprocessing import *

Consumption Classification

This notebook uses the results of the best text classifiers, tuned with GridSearchCV in the Classifiers folder, to produce classification results on new data

  • The first method combines the classification results of multiple tuned classifiers by concatenating their predictions on the new data and dropping duplicates
  • The second approach uses two ensemble methods, stacking and voting, to produce a single classifier for the new data
In [21]:
# read the training data
df = pd.read_csv('Categorized_Links.csv')

# add the Title and Description columns together to form a Text column as our document
df['Text'] = df['Title'] + df['Description']
df = df.dropna(subset= ['Text'], axis = 0)

# preprocess the newly defined Text column
# preprocess_text is a function written and saved in TextClassifier.py

df['Processed Text'] = df['Text'].map(preprocess_text)

# join the token list back into a single string
df['processed_string'] = [' '.join(text) for text in df['Processed Text']]

# collapse the categories into two classes: 'Consumption' vs 'Other'
df['Class'] = df['Category'].apply(lambda c: 'Consumption' if c == 'Consumption' else 'Other')

# sort so that 'Consumption' rows come first, then drop duplicate links keeping the
# first occurrence; a link that appears under both classes keeps its 'Consumption' label
df = df.reset_index(drop=True)
df = df.sort_values('Class')
df = df.reset_index(drop=True)
df = df.drop_duplicates(['Link'], keep='first')
df = df.reset_index(drop=True)
len(df)
Out[21]:
301
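preprocess_text is imported from the ThematicTextClassify package and its implementation is not shown in this notebook. A minimal sketch of a typical implementation, assuming NLTK stop-word removal and WordNet lemmatization (the real function may differ), would be:

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    # lower-case, keep alphabetic tokens, drop English stop words, lemmatize;
    # returns a list of tokens, which is why the cell above joins them back into a string
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = re.findall(r'[a-z]+', text.lower())
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]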
In [22]:
# split data set using train_test_split from sklearn 
text_train, text_test, class_train, class_test = train_test_split(df,
                                                    df['Class'],
                                                    test_size=0.20, 
                                                    random_state=509)

Combining Classifier Predictions

We have already tuned the classifiers with optimized parameters; we now combine their classification results on the new data and drop the duplicates

  • Multinomial Naive Bayes (CountVectorizer/tfidf)
  • Logistic Regression (CountVectorizer/tfidf)
  • Linear SVC (CountVectorizer/tfidf)
  • RandomForestClassifier (CountVectorizer/tfidf)
  • XGBoost (CountVectorizer/tfidf)

This method is not a great approach, since taking the union of every model's positive predictions also accumulates each model's false positives, but let's test it out and see

Load the data to be classified

In [23]:
# Load data to be classified 
full = pd.read_csv("NewData.csv")

# combine the title and description to form the text of each document
full['Text'] = full['Title'] +full['Description']
full['Processed Text'] = full['Text'].map(preprocess_text)
full['processed_string'] =  [' '.join(text) for text in full['Processed Text']]
full = full.reset_index(drop=True)

Classify the New Data with the Tuned Classifiers (CountVectorizer and TF-IDF variants of each)

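The feature_pipe helper is imported from ThematicTextClassify.TextClassifier and is not shown here. Given how it is called below, a minimal sketch of what it presumably does, a vectorizer + classifier pipeline fit on the training text that returns predictions for the new data (the real implementation may differ), is:

from sklearn.pipeline import make_pipeline

def feature_pipe(vectorizer, classifier, train_df, train_labels, new_df):
    # fit the vectorizer and classifier on the processed training text,
    # then predict labels ('Consumption'/'Other') for the new data
    pipe = make_pipeline(vectorizer, classifier)
    pipe.fit(train_df['processed_string'], train_labels)
    return pipe.predict(new_df['processed_string'])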
In [24]:
# MultinomialNB (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df= 0.25, min_df =1, ngram_range = (2,3)),MultinomialNB(alpha=0.25), text_train, class_train, full )
model1 = full[full['Class'] == 'Consumption']

# MultinomialNB (TFIDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df= 0.25, min_df =1, ngram_range = (2,3)),MultinomialNB(alpha=0.5), text_train, class_train, full)
model2 = full[full['Class'] == 'Consumption']

#  LogisticRegression (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df= 0.5, min_df =1, ngram_range = (1,3)), LogisticRegression(C=1.0, penalty = 'l2'), text_train, class_train, full)
model3 = full[full['Class'] == 'Consumption']

# LogisticRegression (TFIDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df= 0.5, min_df =3, ngram_range = (1,3), norm = None),LogisticRegression(C=1.0, penalty = 'l2'), text_train, class_train, full)
model4 = full[full['Class'] == 'Consumption']

# Linear SVC (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df= 0.5, min_df =2, ngram_range = (1,1)),LinearSVC(C=0.15), text_train, class_train, full)
model5 = full[full['Class'] == 'Consumption']

# Linear SVC (TFIDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df= 0.5, min_df = 2, ngram_range = (1,3), norm = None),LinearSVC(C=0.05, max_iter = 3000), text_train, class_train, full)
model6 = full[full['Class'] == 'Consumption']

# Random Forest (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df= 0.5, min_df =3, ngram_range = (1,1)),RandomForestClassifier(max_depth =4, n_estimators = 200, random_state = 3), text_train, class_train, full)
model7 = full[full['Class'] == 'Consumption']

# Random Forest (TFIDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df= 0.5, min_df =3, ngram_range = (1,1), norm = None),RandomForestClassifier(max_depth =4, n_estimators = 200, random_state = 3), text_train, class_train, full)
model8 = full[full['Class'] == 'Consumption']

# XGBoost (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df= 0.75, min_df =3, ngram_range = (1,2)),XGBClassifier(max_depth = 4, seed = 1, random_state=1995,colsample_bytree=0.3, subsample=0.7), text_train, class_train, full)
model9 = full[full['Class'] == 'Consumption']

# XGBoost (TFIDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df= 0.5, min_df =1, ngram_range = (1,2)),XGBClassifier(max_depth = 6, seed = 2, random_state=1995,colsample_bytree=0.3, subsample=0.7), text_train, class_train, full)
model10 = full[full['Class'] == 'Consumption']
In [25]:
# concatenate the 'Consumption' predictions from the first eight models
# (the two XGBoost models are not included here)
frames = [model1, model2, model3, model4, model5, model6, model7, model8]

result_frame = pd.concat(frames)
print("Length of classified data set before dropping duplicates: ", len(result_frame))
result_frame = result_frame.drop_duplicates(['Link'],keep= 'last')
print("Length of classified data set after dropping duplicates: ", len(result_frame))
Length of classified data set before dropping duplicates:  1198
Length of classified data set after dropping duplicates:  300
In [26]:
consumptioncsv = result_frame[['Title', 'Description', 'Link','Class']]
consumptioncsv = consumptioncsv.reset_index(drop=True)
consumptioncsv.to_csv('Consumption.csv', index = False)

Let's take a look at the newly classified data set

  • The results look reasonable, but let's try out the ensemble methods
In [27]:
consumptioncsv.tail()
Out[27]:
Title Description Link Class
295 Nutrient intakes from food, 2015 Archived This is a health fact sheet about the nutrien... https://www150.statcan.gc.ca/n1/pub/82-625-x/2... Consumption
296 Sodium consumption at all ages Archived This article examines the amount of sodium th... https://www150.statcan.gc.ca/n1/pub/82-003-x/2... Consumption
297 Studying scenarios of nutrition intervention: ... Using data from the Canadian Community Health... https://www150.statcan.gc.ca/n1/pub/11-522-x/2... Consumption
298 The eating habits of Canadians Archived Over the past 25 years, the eating habits and... https://www150.statcan.gc.ca/n1/pub/61f0019x/6... Consumption
299 Trends and correlates of frequency of fruit an... Based on annual data from the Canadian Commun... https://www150.statcan.gc.ca/n1/pub/82-003-x/2... Consumption

Ensemble: Stacking Classifiers and Voting Classifiers

  • Using mlxtend: there are two stacking ensembles we could use, StackingClassifier and StackingCVClassifier. In the standard stacking procedure used by StackingClassifier, the first-level classifiers are fit to the same training set that is used to prepare the inputs for the second-level classifier, which may lead to overfitting.

  • The more robust option is StackingCVClassifier, which, as explained in the mlxtend documentation, uses cross-validation: the dataset is split into k folds, and in k successive rounds, k-1 folds are used to fit the first-level classifiers; in each round, the first-level classifiers are then applied to the remaining fold that was not used for model fitting. The resulting predictions are stacked and provided as input data to the second-level classifier. After training the StackingCVClassifier, the first-level classifiers are refit to the entire dataset (see the sketch after this list).

  • Then there is the VotingClassifier, which picks the prediction that gets the majority of votes from the base classifiers (weak learners); its performance is usually not as strong as a stacking classifier's.

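A minimal sketch of the out-of-fold idea behind StackingCVClassifier, written with scikit-learn's cross_val_predict purely for illustration (the ensembles below use mlxtend's implementation; the helper name here is made up):

import numpy as np
from sklearn.model_selection import cross_val_predict

def stack_out_of_fold(base_clfs, meta_clf, X, y):
    # out-of-fold class-1 probabilities become the meta-classifier's features,
    # so the meta-classifier never sees predictions made on data a base model was fit on
    meta_features = np.column_stack([
        cross_val_predict(clf, X, y, cv=5, method='predict_proba')[:, 1]
        for clf in base_clfs
    ])
    meta_clf.fit(meta_features, y)
    # the first-level classifiers are then refit on the full training set for later prediction
    for clf in base_clfs:
        clf.fit(X, y)
    return base_clfs, meta_clf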
In [28]:
StackingClassifier?
In [29]:
StackingCVClassifier?

Setting Up the Training Data, Holdout Set, and New Data to be Classified (CountVectorizer)

  • Note: the ensembles below share a single document-term matrix, so each model cannot have its own vectorizer. We therefore use one CountVectorizer with the tuning parameters most common among the best models above (optimized using GridSearchCV)
In [50]:
# Training data: fit the shared CountVectorizer and encode the classes as 1/0
countvect = CountVectorizer(max_df= 0.5, min_df =3, ngram_range = (1,2))
X_train = countvect.fit_transform(text_train['processed_string'])
X_train = X_train.toarray()
y_train = class_train.replace(to_replace = "Consumption", value = 1)
y_train = y_train.replace(to_replace = "Other", value = 0)
y_train = y_train.values

# Holdout test set
X_test = countvect.transform(text_test['processed_string'])
X_test = X_test.toarray()

y_test = class_test.replace(to_replace = "Consumption", value = 1)
y_test  = y_test.replace(to_replace = "Other", value = 0)
y_test  = y_test.values

# New data to be classified
X_full = countvect.transform(full['processed_string'])
X_full = X_full.toarray()

Stacked Classifier (CountVectorizer)

Using Logistic Regression as meta_classifier

In [51]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier
import random
random.seed(1698)  # note: this seeds Python's random module only; scikit-learn relies on numpy / random_state

stack_clf1 = MultinomialNB(alpha=0.25)
stack_clf2 = LogisticRegression(C=1.0, penalty = 'l2')
stack_clf3 = SVC(kernel='linear', C= 0.15, probability=True)  # SVC with a linear kernel so predict_proba is available for use_probas=True
stack_clf4 = RandomForestClassifier(max_depth =4, n_estimators = 200, random_state = 3)
stack_clf5 = XGBClassifier(max_depth = 4, seed = 1, random_state=1995,colsample_bytree=0.3, subsample=0.7)

meta_clf = LogisticRegression(C=1.0, penalty = 'l2')
sclf_log = StackingCVClassifier(classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
                          use_probas=True,
                          meta_classifier=meta_clf)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_log]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier','XGBoost', 'StackingClassifier']

# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Predicting new data 
sclf_log = sclf_log.fit(X_train,y_train)
prediction_results_int = sclf_log.predict(X_test)
prediction_results = []

for i in prediction_results_int:
    if i == 1:
        prediction_results.append('Consumption')
    else:
        prediction_results.append('Other')
    
# Print Classification report   
print("\n")
print("Stacked Classifier (CountVect) Classification Report (Logistic Regression Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))
5-fold cross validation:

5-fold cross validated Accuracy: 0.86 (+/- 0.05) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.06) [Logistic Regression]
5-fold cross validated Accuracy: 0.89 (+/- 0.02) [Linear SVC]
5-fold cross validated Accuracy: 0.80 (+/- 0.05) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.87 (+/- 0.04) [XGBoost]
5-fold cross validated Accuracy: 0.89 (+/- 0.06) [StackingClassifier]


Stacked Classifier (CountVect) Classification Report (Logistic Regression Meta Classifier)
              precision    recall  f1-score   support

 Consumption       0.95      0.95      0.95        22
       Other       0.97      0.97      0.97        39

    accuracy                           0.97        61
   macro avg       0.96      0.96      0.96        61
weighted avg       0.97      0.97      0.97        61

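The scoring helper used above is imported from ThematicTextClassify.EnsembleClassifiers and is not shown in this notebook. A sketch consistent with the printed output, assuming it simply wraps scikit-learn's cross_val_score (the real implementation may differ):

from sklearn.model_selection import cross_val_score

def scoring(classifiers, classifier_names, X, y):
    # print 5-fold cross-validated accuracy (mean +/- std) for each classifier
    print("5-fold cross validation:\n")
    for clf, name in zip(classifiers, classifier_names):
        scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
        print("5-fold cross validated Accuracy: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), name))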
Stacked Classifier (CountVectorizer)

Using XGBoost as the Meta Classifier

In [32]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

stack_clf1 = MultinomialNB(alpha=0.25)
stack_clf2 = LogisticRegression(C=1.0, penalty = 'l2')
stack_clf3 = SVC(kernel='linear', C= 0.15, probability=True)
stack_clf4 = RandomForestClassifier(max_depth =4, n_estimators = 200, random_state = 3)
stack_clf5 = XGBClassifier(max_depth = 4, seed = 1, random_state=1995,colsample_bytree=0.3, subsample=0.7)

meta_clf = XGBClassifier(max_depth = 4, seed = 1, random_state=1995,colsample_bytree=0.3, subsample=0.7)

sclf_XGB = StackingCVClassifier(classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
                          use_probas=True,
                          meta_classifier=meta_clf)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4,stack_clf5, sclf_XGB]

classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier','XGBoost', 'StackingClassifier']

# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Predicting new data 
sclf_XGB = sclf_XGB.fit(X_train,y_train)
prediction_results_int = sclf_XGB.predict(X_test)
prediction_results = []


for i in prediction_results_int:
    if i == 1:
        prediction_results.append('Consumption')
    else:
        prediction_results.append('Other')

        
# Print Classification report
print("\n")
print("Stacked Classifier (CountVect) Classification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))
5-fold cross validation:

5-fold cross validated Accuracy: 0.86 (+/- 0.05) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.06) [Logistic Regression]
5-fold cross validated Accuracy: 0.89 (+/- 0.02) [Linear SVC]
5-fold cross validated Accuracy: 0.80 (+/- 0.05) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.87 (+/- 0.04) [XGBoost]
5-fold cross validated Accuracy: 0.88 (+/- 0.04) [StackingClassifier]


Stacked Classifier (CountVect) Classification Report (XGBoost as Meta Classifier)
              precision    recall  f1-score   support

 Consumption       0.91      0.95      0.93        22
       Other       0.97      0.95      0.96        39

    accuracy                           0.95        61
   macro avg       0.94      0.95      0.95        61
weighted avg       0.95      0.95      0.95        61

Voting Classifier (CountVectorizer)

In [33]:
vote_clf1 = MultinomialNB(alpha=0.25)
vote_clf2 = LogisticRegression(C=1.0, penalty = 'l2')
vote_clf3 = LinearSVC(C=0.15)
vote_clf4 = RandomForestClassifier(max_depth =4, n_estimators = 200, random_state = 3)
vote_clf5 = XGBClassifier(max_depth = 4, seed = 1, random_state=1995,colsample_bytree=0.3, subsample=0.7)

eclf1 = VotingClassifier(estimators=[('Multinomial Naive Bayes', vote_clf1), ('Logistic Regression Classifier', vote_clf2), ('LinearSVC', vote_clf3), ('RandomForestClassifier', vote_clf4), ('XGBoost', vote_clf5)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)

classifiers = [vote_clf1, vote_clf2, vote_clf3, vote_clf4, vote_clf5, eclf1]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier', 'XGBoost', 'VotingClassifier']

# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Predicting new data 
predicted = eclf1.predict(X_test)
eclf1.score(X_test, y_test)

pred_results = []
for i in predicted:
    if i == 1:
        pred_results.append('Consumption')
    else:
        pred_results.append('Other')

        
        
# Print Classification report
print("\n")
print("Voting Classifier (CountVect) Classification Report")
print(classification_report(class_test.tolist(),pred_results))
5-fold cross validation:

5-fold cross validated Accuracy: 0.86 (+/- 0.05) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.06) [Logistic Regression]
5-fold cross validated Accuracy: 0.90 (+/- 0.03) [Linear SVC]
5-fold cross validated Accuracy: 0.80 (+/- 0.05) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.87 (+/- 0.04) [XGBoost]
5-fold cross validated Accuracy: 0.90 (+/- 0.06) [VotingClassifier]


Voting Classifier (CountVect) Classification Report
              precision    recall  f1-score   support

 Consumption       0.95      0.91      0.93        22
       Other       0.95      0.97      0.96        39

    accuracy                           0.95        61
   macro avg       0.95      0.94      0.95        61
weighted avg       0.95      0.95      0.95        61

Setting Up Data (TfidfVectorizer)

In [34]:
# Training data: fit the shared TfidfVectorizer and encode the classes as 1/0
tfidf = TfidfVectorizer(max_df= 0.5, min_df =3, ngram_range = (1,3), norm = None)
X_train = tfidf.fit_transform(text_train['processed_string'])
X_train = X_train.toarray()
y_train = class_train.replace(to_replace = "Consumption", value = 1)
y_train = y_train.replace(to_replace = "Other", value = 0)
y_train = y_train.values

# Holdout test set
X_test = tfidf.transform(text_test['processed_string'])
X_test = X_test.toarray()

y_test = class_test.replace(to_replace = "Consumption", value = 1)
y_test  = y_test.replace(to_replace = "Other", value = 0)
y_test  = y_test.values

# New data to be classified
X_full = tfidf.transform(full['processed_string'])
X_full = X_full.toarray()

Stacked Classifier (TfidfVectorizer)

Using Logistic Regression as meta_classifier

In [35]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

stack_clf1 = MultinomialNB(alpha=0.5)
stack_clf2 = LogisticRegression(C=1.0, penalty = 'l2')
stack_clf3 = SVC(kernel='linear', C= 0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth =4, n_estimators = 200, random_state = 3)
stack_clf5 = XGBClassifier(max_depth = 6, seed = 2, random_state=1995,colsample_bytree=0.3, subsample=0.7)

meta_clf = LogisticRegression(C=0.5, penalty = 'l2')
sclf_tfidf = StackingCVClassifier(classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
                          use_probas=True,
                          meta_classifier=meta_clf)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier', 'XGBoost', 'StackingClassifier']


# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Predicting new data 
sclf_tfidf = sclf_tfidf.fit(X_train,y_train)
prediction_results_int = sclf_tfidf.predict(X_test)
prediction_results = []

for i in prediction_results_int:
    if i == 1:
        prediction_results.append('Consumption')
    else:
        prediction_results.append('Other')
        
# Print Classification report
print("\n")
print("Stacked Classifier (TF-IDF) Classification Report (Logistic Regression as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))
5-fold cross validation:

5-fold cross validated Accuracy: 0.86 (+/- 0.07) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.05) [Logistic Regression]
5-fold cross validated Accuracy: 0.89 (+/- 0.05) [Linear SVC]
5-fold cross validated Accuracy: 0.80 (+/- 0.05) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.88 (+/- 0.04) [XGBoost]
5-fold cross validated Accuracy: 0.90 (+/- 0.06) [StackingClassifier]


Stacked Classifier (TF-IDF) Classification Report (Logistic Regression as Meta Classifier)
              precision    recall  f1-score   support

 Consumption       0.91      0.95      0.93        22
       Other       0.97      0.95      0.96        39

    accuracy                           0.95        61
   macro avg       0.94      0.95      0.95        61
weighted avg       0.95      0.95      0.95        61

Stacked Classifier (TfidfVectorizer)

Using XGBoost as the Meta Classifier

In [36]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

stack_clf1 = MultinomialNB(alpha=0.5)
stack_clf2 = LogisticRegression(C=1.0, penalty = 'l2')
stack_clf3 = SVC(kernel='linear', C= 0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth =4, n_estimators = 200, random_state = 3)
stack_clf5 = XGBClassifier(max_depth = 6, seed = 2, random_state=1995,colsample_bytree=0.3, subsample=0.7)
meta_clf = XGBClassifier(random_state=1995, seed=1, colsample_bytree=0.3, subsample=0.3)

sclf_XGB_tfidf = StackingCVClassifier(classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
                          use_probas=True,
                          meta_classifier=meta_clf)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_XGB_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier','XGBoost', 'StackingClassifier']


# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Predicting new data 
sclf_XGB_tfidf = sclf_XGB_tfidf.fit(X_train,y_train)
prediction_results_int = sclf_XGB_tfidf.predict(X_test)
prediction_results = []

for i in prediction_results_int:
    if i == 1:
        prediction_results.append('Consumption')
    else:
        prediction_results.append('Other')
        
# Print Classification report    
print("\n")
print("Stacked Classifier (TF-IDF) Classification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))
5-fold cross validation:

5-fold cross validated Accuracy: 0.86 (+/- 0.07) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.05) [Logistic Regression]
5-fold cross validated Accuracy: 0.89 (+/- 0.05) [Linear SVC]
5-fold cross validated Accuracy: 0.80 (+/- 0.05) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.88 (+/- 0.04) [XGBoost]
5-fold cross validated Accuracy: 0.88 (+/- 0.05) [StackingClassifier]


Stacked Classifier (TF-IDF) Classification Report (XGBoost as Meta Classifier)
              precision    recall  f1-score   support

 Consumption       0.91      0.91      0.91        22
       Other       0.95      0.95      0.95        39

    accuracy                           0.93        61
   macro avg       0.93      0.93      0.93        61
weighted avg       0.93      0.93      0.93        61

Voting Classifier (TfidfVectorizer)

In [37]:
vote_clf1 = MultinomialNB(alpha=0.5)
vote_clf2 = LogisticRegression(C=1.0, penalty = 'l2')
vote_clf3 = LinearSVC(C=0.05, max_iter = 3000)
vote_clf4 = RandomForestClassifier(max_depth =4, n_estimators = 200, random_state = 3)
vote_clf5 = XGBClassifier(max_depth = 6, seed = 2, random_state=1995,colsample_bytree=0.3, subsample=0.7)


eclf1_tfidf = VotingClassifier(estimators=[('Multinomial Naive Bayes', vote_clf1), ('Logistic Regression Classifier', vote_clf2), ('LinearSVC', vote_clf3), ('RandomForestClassifier', vote_clf4), ('XGBoost',vote_clf5)], voting='hard')
eclf1_tfidf = eclf1_tfidf.fit(X_train, y_train)

classifiers = [vote_clf1, vote_clf2, vote_clf3, vote_clf4, vote_clf5, eclf1_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier','XGBoost', 'VotingClassifier']

# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)
    
# Predicting new data 
predicted = eclf1_tfidf.predict(X_test)
eclf1_tfidf.score(X_test, y_test)

pred_results = []
for i in predicted:
    if i == 1:
        pred_results.append('Consumption')
    else:
        pred_results.append('Other')

# Print Classification report
print("\n")
print("Voting Classifier (TF-IDF) Classification Report")
print(classification_report(class_test.tolist(),pred_results))
5-fold cross validation:

5-fold cross validated Accuracy: 0.86 (+/- 0.07) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.05) [Logistic Regression]
5-fold cross validated Accuracy: 0.88 (+/- 0.03) [Linear SVC]
5-fold cross validated Accuracy: 0.80 (+/- 0.05) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.88 (+/- 0.04) [XGBoost]
5-fold cross validated Accuracy: 0.89 (+/- 0.05) [VotingClassifier]


Voting Classifier (TF-IDF) Classification Report
              precision    recall  f1-score   support

 Consumption       0.90      0.86      0.88        22
       Other       0.93      0.95      0.94        39

    accuracy                           0.92        61
   macro avg       0.91      0.91      0.91        61
weighted avg       0.92      0.92      0.92        61

Appending the New Categories to the Data Set

  • We pick the model with the highest precision/F1-score on the holdout set, the CountVectorizer stacked classifier with the logistic regression meta-classifier (sclf_log), as our final model (a hypothetical programmatic comparison is sketched below)
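As a purely illustrative sketch of that selection step (not part of the original notebook), the fitted ensembles could be compared programmatically on the holdout set, assuming each model is paired with the test matrix built by its own vectorizer; X_test_count and X_test_tfidf are made-up names, since the notebook reuses the single name X_test for both setups:

from sklearn.metrics import f1_score

candidates = {
    'Stacking (CountVect, LR meta)':  (sclf_log, X_test_count),
    'Stacking (CountVect, XGB meta)': (sclf_XGB, X_test_count),
    'Voting (CountVect)':             (eclf1, X_test_count),
    'Stacking (TF-IDF, LR meta)':     (sclf_tfidf, X_test_tfidf),
    'Stacking (TF-IDF, XGB meta)':    (sclf_XGB_tfidf, X_test_tfidf),
    'Voting (TF-IDF)':                (eclf1_tfidf, X_test_tfidf),
}
for name, (model, X_eval) in candidates.items():
    # F1 on the positive ('Consumption' = 1) class
    print(name, "F1:", round(f1_score(y_test, model.predict(X_eval)), 3))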
In [52]:
full['BestModelClassification'] = sclf_log.predict(X_full)
full['BestModelClassification'] = full['BestModelClassification'].replace(to_replace = 1, value = "Consumption")
full['BestModelClassification'] = full['BestModelClassification'].replace(to_replace = 0, value = "Other")
print(len(full[full['BestModelClassification'] == 'Consumption']))
full['BestModelClassification'].value_counts()
152
Out[52]:
Other          1545
Consumption     152
Name: BestModelClassification, dtype: int64
In [53]:
df = full[full['BestModelClassification'] == 'Consumption']
df = df.reset_index(drop=True)
In [54]:
df.to_csv("BestConsumption.csv", index = False)