# Import the .py files that store all our functions and library imports
from ThematicTextClassify.TextClassifier import *
from ThematicTextClassify.Preprocessing import *
# Read the training data
df = pd.read_csv('Categorized_Links.csv')
# Concatenate the Title and Description columns into a single Text column, our document
df['Text'] = df['Title'] + df['Description']
df = df.dropna(subset=['Text'], axis=0)
# preprocess the newly defined Text column
# preprocess_text is a function written and saved in TextClassifier.py
df['Processed Text'] = df['Text'].map(preprocess_text)
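For reference, here is a minimal sketch of what preprocess_text plausibly does, assuming a typical NLTK pipeline (lowercasing, tokenization, stopword removal, lemmatization); the packaged implementation may differ.
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
STOPWORDS = set(stopwords.words('english'))  # requires nltk 'stopwords' data
LEMMATIZER = WordNetLemmatizer()             # requires nltk 'wordnet' data
def preprocess_text(text):
    # Hypothetical sketch: lowercase, keep letters only, tokenize (requires
    # nltk 'punkt' data), drop stopwords and single characters, lemmatize.
    # Returns a token list, matching the format joined into 'processed_string' below.
    text = re.sub(r'[^a-z\s]', ' ', str(text).lower())
    tokens = word_tokenize(text)
    return [LEMMATIZER.lemmatize(tok) for tok in tokens
            if tok not in STOPWORDS and len(tok) > 1]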
# Join each token list back into a single string
df['processed_string'] = [' '.join(text) for text in df['Processed Text']]
# Collapse Category into a binary Class label: 'Consumption' vs 'Other'
df['Class'] = df['Category'].apply(lambda c: 'Consumption' if c == 'Consumption' else 'Other')
# Sorting by Class puts the 'Consumption' rows first, so dropping duplicate
# links with keep='first' retains the 'Consumption' label whenever a link
# appears under both classes
df = df.reset_index(drop=True)
df = df.sort_values('Class')
df = df.reset_index(drop=True)
df = df.drop_duplicates(['Link'], keep='first')
df = df.reset_index(drop=True)
len(df)
# split data set using train_test_split from sklearn
text_train, text_test, class_train, class_test = train_test_split(df,
df['Class'],
test_size=0.20,
random_state=509)
# Load data to be classified
full = pd.read_csv("NewData.csv")
# Combine the title and description into a Text column, as with the training data
full['Text'] = full['Title'] + full['Description']
full['Processed Text'] = full['Text'].map(preprocess_text)
full['processed_string'] = [' '.join(text) for text in full['Processed Text']]
full = full.reset_index(drop=True)
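feature_pipe is imported from TextClassifier.py. For readers without the repo handy, here is a plausible minimal sketch of it, assuming it fits a vectorizer-plus-classifier Pipeline on the training documents and labels every row of the new data; the actual implementation may differ.
from sklearn.pipeline import Pipeline
def feature_pipe(vectorizer, classifier, text_train, class_train, new_data):
    # Hypothetical sketch: fit the vectorizer and classifier on the training
    # strings, then return predicted class labels for the new documents.
    pipe = Pipeline([('vect', vectorizer), ('clf', classifier)])
    pipe.fit(text_train['processed_string'], class_train)
    return pipe.predict(new_data['processed_string'])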
# MultinomialNB (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df=0.25, min_df=1, ngram_range=(2, 3)), MultinomialNB(alpha=0.25), text_train, class_train, full)
model1 = full[full['Class'] == 'Consumption']
# MultinomialNB (TF-IDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df=0.25, min_df=1, ngram_range=(2, 3)), MultinomialNB(alpha=0.5), text_train, class_train, full)
model2 = full[full['Class'] == 'Consumption']
# LogisticRegression (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df=0.5, min_df=1, ngram_range=(1, 3)), LogisticRegression(C=1.0, penalty='l2'), text_train, class_train, full)
model3 = full[full['Class'] == 'Consumption']
# LogisticRegression (TF-IDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df=0.5, min_df=3, ngram_range=(1, 3), norm=None), LogisticRegression(C=1.0, penalty='l2'), text_train, class_train, full)
model4 = full[full['Class'] == 'Consumption']
# Linear SVC (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 1)), LinearSVC(C=0.15), text_train, class_train, full)
model5 = full[full['Class'] == 'Consumption']
# Linear SVC (TF-IDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), norm=None), LinearSVC(C=0.05, max_iter=3000), text_train, class_train, full)
model6 = full[full['Class'] == 'Consumption']
# Random Forest (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df=0.5, min_df=3, ngram_range=(1, 1)), RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3), text_train, class_train, full)
model7 = full[full['Class'] == 'Consumption']
# Random Forest (TF-IDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df=0.5, min_df=3, ngram_range=(1, 1), norm=None), RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3), text_train, class_train, full)
model8 = full[full['Class'] == 'Consumption']
# XGBoost (CountVect)
full['Class'] = feature_pipe(CountVectorizer(max_df=0.75, min_df=3, ngram_range=(1, 2)), XGBClassifier(max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7), text_train, class_train, full)
model9 = full[full['Class'] == 'Consumption']
# XGBoost (TF-IDF)
full['Class'] = feature_pipe(TfidfVectorizer(max_df=0.5, min_df=1, ngram_range=(1, 2)), XGBClassifier(max_depth=6, seed=2, random_state=1995, colsample_bytree=0.3, subsample=0.7), text_train, class_train, full)
model10 = full[full['Class'] == 'Consumption']
# Concatenate the per-model 'Consumption' predictions (note: the XGBoost
# outputs, model9 and model10, are not included in this union)
frames = [model1, model2, model3, model4, model5, model6, model7, model8]
result_frame = pd.concat(frames)
print("Length of classified data set before dropping duplicates: ", len(result_frame))
result_frame = result_frame.drop_duplicates(['Link'], keep='last')
print("Length of classified data set after dropping duplicates: ", len(result_frame))
consumptioncsv = result_frame[['Title', 'Description', 'Link', 'Class']]
consumptioncsv = consumptioncsv.reset_index(drop=True)
consumptioncsv.to_csv('Consumption.csv', index=False)
consumptioncsv.tail()
Using mlxtend: there are two stacking ensemble classifiers we could use, StackingClassifier and StackingCVClassifier. With StackingClassifier, the standard stacking procedure fits the first-level classifiers to the same training set that is used to prepare the inputs for the second-level classifier, which may lead to overfitting. The more advanced option is StackingCVClassifier, which, as explained in the mlxtend documentation, uses the concept of cross-validation: the dataset is split into k folds, and in k successive rounds, k-1 folds are used to fit the first-level classifiers; in each round, the first-level classifiers are then applied to the remaining subset that was not used for model fitting. The resulting predictions are stacked and provided as input data to the second-level classifier. After training, the first-level classifiers of the StackingCVClassifier are refit to the entire dataset, as illustrated in the mlxtend documentation.
Then there is the VotingClassifier, which picks predictions based on the majority vote of the base classifiers (weak learners); its performance is usually not as strong as a stacking classifier's.
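To make the out-of-fold idea concrete, here is a minimal sketch of how level-one features can be built with scikit-learn's cross_val_predict. It illustrates the concept only, not the mlxtend internals, and the helper name stacking_features is ours.
import numpy as np
from sklearn.model_selection import cross_val_predict
def stacking_features(classifiers, X, y, cv=5):
    # Each base classifier predicts class probabilities out-of-fold, so the
    # meta classifier never sees predictions made on rows a base model was
    # fit on. Base models must implement predict_proba.
    oof = [cross_val_predict(clf, X, y, cv=cv, method='predict_proba')
           for clf in classifiers]
    return np.hstack(oof)  # level-one input for the meta classifier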
# Inspect the mlxtend docstrings (IPython help)
StackingClassifier?
StackingCVClassifier?
# Training data: CountVectorizer features
countvect = CountVectorizer(max_df=0.5, min_df=3, ngram_range=(1, 2))
X_train = countvect.fit_transform(text_train['processed_string'])
X_train = X_train.toarray()
y_train = class_train.replace(to_replace="Consumption", value=1)
y_train = y_train.replace(to_replace="Other", value=0)
y_train = y_train.values
X_test = countvect.transform(text_test['processed_string'])
X_test = X_test.toarray()
y_test = class_test.replace(to_replace="Consumption", value=1)
y_test = y_test.replace(to_replace="Other", value=0)
y_test = y_test.values
X_full = countvect.transform(full['processed_string'])
X_full = X_full.toarray()
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier
import random
# Note: this seeds Python's built-in random module only; scikit-learn and
# XGBoost draw on numpy's RNG, so reproducibility here comes from the
# random_state/seed arguments set on each model below
random.seed(1698)
stack_clf1 = MultinomialNB(alpha=0.25)
stack_clf2 = LogisticRegression(C=1.0, penalty='l2')
stack_clf3 = SVC(kernel='linear', C=0.15, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3)
stack_clf5 = XGBClassifier(max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7)
meta_clf = LogisticRegression(C=1.0, penalty='l2')
sclf_log = StackingCVClassifier(classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
use_probas=True,
meta_classifier=meta_clf)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_log]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier','XGBoost', 'StackingClassifier']
# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)
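scoring is also imported from the package. A minimal sketch of what it presumably computes, assuming mean 5-fold cross-validated accuracy per classifier via cross_val_score; the packaged version may differ.
from sklearn.model_selection import cross_val_score
def scoring(classifiers, classifier_names, X, y):
    # Hypothetical sketch: print each model's mean 5-fold CV accuracy
    # along with its standard deviation.
    for clf, name in zip(classifiers, classifier_names):
        scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
        print("Accuracy: %.3f (+/- %.3f) [%s]" % (scores.mean(), scores.std(), name))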
# Predicting new data
sclf_log = sclf_log.fit(X_train, y_train)
prediction_results_int = sclf_log.predict(X_test)
# Map integer predictions back to class labels
prediction_results = ['Consumption' if i == 1 else 'Other' for i in prediction_results_int]
# Print classification report
print("\n")
print("Stacked Classifier (CountVect) Classification Report (Logistic Regression Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))
stack_clf1 = MultinomialNB(alpha=0.25)
stack_clf2 = LogisticRegression(C=1.0, penalty='l2')
stack_clf3 = SVC(kernel='linear', C=0.15, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3)
stack_clf5 = XGBClassifier(max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7)
meta_clf = XGBClassifier(max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7)
sclf_XGB = StackingCVClassifier(classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
use_probas=True,
meta_classifier=meta_clf)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_XGB]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier','XGBoost', 'StackingClassifier']
# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)
# Predicting new data
sclf_XGB = sclf_XGB.fit(X_train, y_train)
prediction_results_int = sclf_XGB.predict(X_test)
# Map integer predictions back to class labels
prediction_results = ['Consumption' if i == 1 else 'Other' for i in prediction_results_int]
# Print classification report
print("\n")
print("Stacked Classifier (CountVect) Classification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))
vote_clf1 = MultinomialNB(alpha=0.25)
vote_clf2 = LogisticRegression(C=1.0, penalty='l2')
vote_clf3 = LinearSVC(C=0.15)
vote_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3)
vote_clf5 = XGBClassifier(max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7)
eclf1 = VotingClassifier(estimators=[('Multinomial Naive Bayes', vote_clf1), ('Logistic Regression Classifier', vote_clf2), ('LinearSVC', vote_clf3), ('RandomForestClassifier', vote_clf4), ('XGBoost', vote_clf5)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
classifiers = [vote_clf1, vote_clf2, vote_clf3, vote_clf4, vote_clf5, eclf1]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier', 'XGBoost', 'VotingClassifier']
# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)
# Predicting new data
predicted = eclf1.predict(X_test)
eclf1.score(X_test, y_test)
# Map integer predictions back to class labels
pred_results = ['Consumption' if i == 1 else 'Other' for i in predicted]
# Print classification report
print("\n")
print("Voting Classifier (CountVect) Classification Report")
print(classification_report(class_test.tolist(), pred_results))
# Training data: TF-IDF features
tfidf = TfidfVectorizer(max_df=0.5, min_df=3, ngram_range=(1, 3), norm=None)
X_train = tfidf.fit_transform(text_train['processed_string'])
X_train = X_train.toarray()
y_train = class_train.replace(to_replace="Consumption", value=1)
y_train = y_train.replace(to_replace="Other", value=0)
y_train = y_train.values
X_test = tfidf.transform(text_test['processed_string'])
X_test = X_test.toarray()
y_test = class_test.replace(to_replace="Consumption", value=1)
y_test = y_test.replace(to_replace="Other", value=0)
y_test = y_test.values
X_full = tfidf.transform(full['processed_string'])
X_full = X_full.toarray()
stack_clf1 = MultinomialNB(alpha=0.5)
stack_clf2 = LogisticRegression(C=1.0, penalty='l2')
stack_clf3 = SVC(kernel='linear', C=0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3)
stack_clf5 = XGBClassifier(max_depth=6, seed=2, random_state=1995, colsample_bytree=0.3, subsample=0.7)
meta_clf = LogisticRegression(C=0.5, penalty='l2')
sclf_tfidf = StackingCVClassifier(classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
use_probas=True,
meta_classifier=meta_clf)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier', 'XGBoost', 'StackingClassifier']
# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)
# Predicting new data
sclf_tfidf = sclf_tfidf.fit(X_train, y_train)
prediction_results_int = sclf_tfidf.predict(X_test)
# Map integer predictions back to class labels
prediction_results = ['Consumption' if i == 1 else 'Other' for i in prediction_results_int]
# Print classification report
print("\n")
print("Stacked Classifier (TF-IDF) Classification Report (Logistic Regression as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))
stack_clf1 = MultinomialNB(alpha=0.5)
stack_clf2 = LogisticRegression(C=1.0, penalty='l2')
stack_clf3 = SVC(kernel='linear', C=0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3)
stack_clf5 = XGBClassifier(max_depth=6, seed=2, random_state=1995, colsample_bytree=0.3, subsample=0.7)
meta_clf = XGBClassifier(random_state=1995, seed=1, colsample_bytree=0.3, subsample=0.3)
sclf_XGB_tfidf = StackingCVClassifier(classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
use_probas=True,
meta_classifier=meta_clf)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_XGB_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier','XGBoost', 'StackingClassifier']
# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)
# Predicting new data
sclf_XGB_tfidf = sclf_XGB_tfidf.fit(X_train, y_train)
prediction_results_int = sclf_XGB_tfidf.predict(X_test)
# Map integer predictions back to class labels
prediction_results = ['Consumption' if i == 1 else 'Other' for i in prediction_results_int]
# Print classification report
print("\n")
print("Stacked Classifier (TF-IDF) Classification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))
vote_clf1 = MultinomialNB(alpha=0.5)
vote_clf2 = LogisticRegression(C=1.0, penalty='l2')
vote_clf3 = LinearSVC(C=0.05, max_iter=3000)
vote_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3)
vote_clf5 = XGBClassifier(max_depth=6, seed=2, random_state=1995, colsample_bytree=0.3, subsample=0.7)
eclf1_tfidf = VotingClassifier(estimators=[('Multinomial Naive Bayes', vote_clf1), ('Logistic Regression Classifier', vote_clf2), ('LinearSVC', vote_clf3), ('RandomForestClassifier', vote_clf4), ('XGBoost', vote_clf5)], voting='hard')
eclf1_tfidf = eclf1_tfidf.fit(X_train, y_train)
classifiers = [vote_clf1, vote_clf2, vote_clf3, vote_clf4, vote_clf5, eclf1_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC', 'Random Forest Classifier','XGBoost', 'VotingClassifier']
# Accuracy function (5-fold cross-validated)
scoring(classifiers, classifier_names, X_train, y_train)
# Predicting new data
predicted = eclf1_tfidf.predict(X_test)
eclf1_tfidf.score(X_test, y_test)
# Map integer predictions back to class labels
pred_results = ['Consumption' if i == 1 else 'Other' for i in predicted]
# Print classification report
print("\n")
print("Voting Classifier (TF-IDF) Classification Report")
print(classification_report(class_test.tolist(), pred_results))
# The best model (sclf_log) was fit on CountVectorizer features, but X_full
# was last overwritten with the TF-IDF transform above, so rebuild it with
# the CountVectorizer before predicting
X_full = countvect.transform(full['processed_string']).toarray()
full['BestModelClassification'] = sclf_log.predict(X_full)
full['BestModelClassification'] = full['BestModelClassification'].replace(to_replace=1, value="Consumption")
full['BestModelClassification'] = full['BestModelClassification'].replace(to_replace=0, value="Other")
print(len(full[full['BestModelClassification'] == 'Consumption']))
full['BestModelClassification'].value_counts()
df = full[full['BestModelClassification'] == 'Consumption']
df = df.reset_index(drop=True)
df.to_csv("BestConsumption.csv", index=False)