-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
107 lines (87 loc) · 4.38 KB
/
main.py
File metadata and controls
107 lines (87 loc) · 4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from myfunctions import *
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# 1. Data Collection and Preparation (Pre-Processing):
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# Combine and save data from scraped articles to csv file "uncleaned_data.csv"
filename = 'uncleaned_data.csv'
filename_preprocess = 'preprocessed_data.csv'
combine_and_save_data(filename)
# Read the combined raw data back from disk
uncleaned_data = pd.read_csv(filename)
# Clean/tokenize the raw articles (preprocess_data is defined in myfunctions)
preprocessed_data = preprocess_data(uncleaned_data)
# Persist the preprocessed dataset to CSV for later inspection/reuse
save_preprocessed_data(preprocessed_data, filename_preprocess)
preprocessed_df = pd.DataFrame(preprocessed_data)
# Train/test split: the first TRAIN_SIZE rows train, the remainder test.
# (The original comment claimed "first 12 rows" while the slice took 36.)
# NOTE(review): this split is positional, not shuffled/stratified — it assumes
# the rows are already ordered appropriately; confirm against the data layout.
TRAIN_SIZE = 36
train_set = preprocessed_df.iloc[:TRAIN_SIZE]   # first 36 rows
test_set = preprocessed_df.iloc[TRAIN_SIZE:]    # remaining rows
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# 2. Graph Construction:
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# Build one directed graph per document from its token sequence
# (construct_graph is defined in myfunctions). Comprehensions replace the
# original manual append loops.
train_graphs = [construct_graph(row['content_tokens']) for _, row in train_set.iterrows()]
test_graphs = [construct_graph(row['content_tokens']) for _, row in test_set.iterrows()]
# Visualize one training graph as a sanity check; guard the hard-coded index
# so the script does not crash when fewer than 3 training rows exist.
if len(train_graphs) > 2:
    plot_graph(train_graphs[2])
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# 3. Classification with KNN:
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# Ground-truth labels for both splits
train_labels = train_set['label'].tolist()
test_labels = test_set['label'].tolist()
# Classify every test graph with graph-based k-nearest-neighbours.
# zip() pairs each test graph with its true label directly, replacing the
# error-prone manual index counter the original loop carried.
k = 3  # number of neighbours consulted by knn (defined in myfunctions)
predicted_labels = []
true_labels = []
for test_graph, true_label in zip(test_graphs, test_labels):
    predicted_label = knn(train_graphs, test_graph, k, train_labels)
    predicted_labels.append(predicted_label)
    true_labels.append(true_label)
    print(f'Predicted class: {predicted_label} ------- Actual Class: {true_label}')
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# 4. Evaluation:
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# Overall accuracy, reported as a percentage. true_labels is used throughout
# for consistency (the original mixed true_labels and test_labels, which are
# equal lists built in section 3).
accuracy = accuracy_score(true_labels, predicted_labels)
accuracy_percentage = accuracy * 100
print("Accuracy: ", accuracy_percentage)
# Per-class precision / recall / F1
report = classification_report(true_labels, predicted_labels)
print("Classification Report:")
print(report)
# Confusion matrix. Pass an explicit `labels=` order so the matrix rows and
# columns are guaranteed to line up with the heatmap tick labels below — the
# original relied on confusion_matrix's default ordering (sorted union of
# y_true and y_pred) coinciding with np.unique(train_labels), which breaks
# if the test set contains a class absent from the training labels.
class_labels = np.unique(train_labels)
cm = confusion_matrix(true_labels, predicted_labels, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()