-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
107 lines (87 loc) · 4.38 KB
/
main.py
File metadata and controls
107 lines (87 loc) · 4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from myfunctions import *
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# 1. Data Collection and Preparation (Pre-Processing):
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# Combine and save data from scraped articles to csv file "uncleaned_data.csv"
filename = 'uncleaned_data.csv'
filename_preprocess = 'preprocessed_data.csv'
combine_and_save_data(filename)
# Read the combined raw data back from disk
uncleaned_data = pd.read_csv(filename)
# Clean/tokenize the raw articles (preprocess_data is defined in myfunctions)
preprocessed_data = preprocess_data(uncleaned_data)
# Persist the preprocessed dataset to CSV for later inspection/reuse
save_preprocessed_data(preprocessed_data, filename_preprocess)
preprocessed_df = pd.DataFrame(preprocessed_data)
# Train/test split: the first TRAIN_SIZE rows train, the remainder test.
# (The original comment claimed "first 12 rows" while the slice took 36.)
# NOTE(review): this split is positional, not shuffled/stratified — it assumes
# the rows are already ordered appropriately; confirm against the data layout.
TRAIN_SIZE = 36
train_set = preprocessed_df.iloc[:TRAIN_SIZE]   # first 36 rows
test_set = preprocessed_df.iloc[TRAIN_SIZE:]    # remaining rows
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# 2. Graph Construction:
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# Build one directed graph per document from its token sequence
# (construct_graph is defined in myfunctions). Comprehensions replace the
# original manual append loops.
train_graphs = [construct_graph(row['content_tokens']) for _, row in train_set.iterrows()]
test_graphs = [construct_graph(row['content_tokens']) for _, row in test_set.iterrows()]
# Visualize one training graph as a sanity check; guard the hard-coded index
# so the script does not crash when fewer than 3 training rows exist.
if len(train_graphs) > 2:
    plot_graph(train_graphs[2])
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# 3. Classification with KNN:
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# Ground-truth labels for both splits
train_labels = train_set['label'].tolist()
test_labels = test_set['label'].tolist()
# Classify every test graph with graph-based k-nearest-neighbours.
# zip() pairs each test graph with its true label directly, replacing the
# error-prone manual index counter the original loop carried.
k = 3  # number of neighbours consulted by knn (defined in myfunctions)
predicted_labels = []
true_labels = []
for test_graph, true_label in zip(test_graphs, test_labels):
    predicted_label = knn(train_graphs, test_graph, k, train_labels)
    predicted_labels.append(predicted_label)
    true_labels.append(true_label)
    print(f'Predicted class: {predicted_label} ------- Actual Class: {true_label}')
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# 4. Evaluation:
#--------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------
# Overall accuracy, reported as a percentage. true_labels is used throughout
# for consistency (the original mixed true_labels and test_labels, which are
# equal lists built in section 3).
accuracy = accuracy_score(true_labels, predicted_labels)
accuracy_percentage = accuracy * 100
print("Accuracy: ", accuracy_percentage)
# Per-class precision / recall / F1
report = classification_report(true_labels, predicted_labels)
print("Classification Report:")
print(report)
# Confusion matrix. Pass an explicit `labels=` order so the matrix rows and
# columns are guaranteed to line up with the heatmap tick labels below — the
# original relied on confusion_matrix's default ordering (sorted union of
# y_true and y_pred) coinciding with np.unique(train_labels), which breaks
# if the test set contains a class absent from the training labels.
class_labels = np.unique(train_labels)
cm = confusion_matrix(true_labels, predicted_labels, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()