-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSentiment analysis (Tfidf).py
More file actions
101 lines (88 loc) · 3.97 KB
/
Sentiment analysis (Tfidf).py
File metadata and controls
101 lines (88 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 8 16:27:11 2023
@author: 91944
"""
import pandas as pd
import numpy as np
import streamlit as st
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lm=WordNetLemmatizer()
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
le=LabelEncoder()
import pickle
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import gensim.downloader
import gensim
from gensim.models import Word2Vec, KeyedVectors
#Text Cleaning
def text_cleaning(Text):
    """Normalize raw text before tokenization/vectorization.

    Steps, in order: strip http(s)/www links, replace punctuation with
    spaces, drop digits, drop remaining special characters, lowercase,
    and remove non-ASCII characters.

    Parameters
    ----------
    Text : str
        Raw input text.

    Returns
    -------
    str
        Cleaned, lower-cased ASCII-only text. Runs of spaces may remain;
        downstream tokenization handles them.
    """
    # Removal of links. Raw strings fix the invalid '\S' escape
    # sequences of the original (SyntaxWarning on modern Python).
    Text = re.sub(r'https?://\S+|www\.\S+', '', Text)
    # Replace each punctuation character with a space so joined tokens
    # like "profit/loss" split into separate words.
    punc = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    Text = Text.translate(punc)
    # Removal of numbers.
    Text = re.sub(r'\d+', '', Text)
    # Removal of any remaining special characters (non word/space).
    Text = re.sub(r'[^\w\s]', '', Text)
    # Lower-case transformation.
    Text = Text.lower()
    # Remove non-ASCII (non-English) characters.
    Text = re.sub(r'[^\u0000-\u007F]+', '', Text)
    return Text
# Base English stop-word list; retain a handful of words that carry
# sentiment/direction signal in financial text.
Stopwords = set(stopwords.words("english"))
for kept in ('not', 'down', "more", "under"):
    Stopwords.remove(kept)
# Domain-specific noise terms (tickers, currencies, reporting-period
# words, etc.) that are frequent in the financial-news corpus.
domain_words = ['finnish','russian','finland','russia','swedish','firm','eighteen','months','taking','total','square',
                'eur','million','announcement','day','earlier','glaston','net','third','quarter','dropped','mln','euro',
                'period','april','baltic','countries','eur mn','last','year','million','state',
                'office','msft','orcl','goog','crm','adbe','aapl','afternoon','esi','billion','eurm','third','quarter',
                'half','annually','annualy','first','second','nine','helsinki','omx','year','month','day','indian','india','third',
                'fourth','mn','mln','in','eur','euro','months','goods','one','the', 'of', 'in', 'to', 'and', 'a','eur', 'for',
                's', 'is', 'on', 'from', 'will', 'company', 'as', 'mn', 'its', 'with', 'by', 'be', 'has', 'at','it', 'said',
                'million', 'net', 'year', 'm', 'that', 'was', 'group', 'an', 'mln','new', 'are', 'quarter','this', 'oyj','also',
                'have', 'which', 'first', 'euro', 'today', 'been', 'about', 'helsinki', 'per','total', 'after', 'nokia', 'bank',
                'based', 'were', 'we', 'than', 'some','or', 'other', 'all', 'one', 'hel', 'our', 'plc', 'now', 'last', 'their',
                'second', 'ceo', 'pct', 'january', 'into', 'aapl', 'would', 'eurm', 'out', 'part', 'oy','i','september', 'usd',
                'two', 'third','earlier', 'can', 'time', 'billion','had', 'omx','us', 'russia', 'may','annual', 'day', 'both',
                'tsla','while', 'before','months', 'number', 'march', 'october', 'euros',
                'they','through', 'april']
Stopwords.update(domain_words)
def Text_Processing(Text):
    """Tokenize, drop stop words, and lemmatize the cleaned text.

    Parameters
    ----------
    Text : str
        Cleaned input text (output of ``text_cleaning``).

    Returns
    -------
    str
        Space-joined lemmatized tokens with stop words removed.
    """
    # Tokens of words.
    Tokens = nltk.word_tokenize(Text)
    # Reuse the module-level lemmatizer ``lm`` instead of constructing a
    # new WordNetLemmatizer on every call (the original rebuilt one per
    # invocation, which is redundant work).
    Processed_Text = [lm.lemmatize(word) for word in Tokens if word not in Stopwords]
    return " ".join(Processed_Text)
# Load the fitted TF-IDF vectorizer and the trained SVM classifier.
# Use context managers so the file handles are closed (the original
# pickle.load(open(...)) pattern leaked them).
# NOTE(security): pickle.load executes arbitrary code from the file —
# only load model artifacts from a trusted source.
with open('tf_idf_model.pkl', 'rb') as f:
    tfidf = pickle.load(f)
with open('SVM_Tfidf_Clf.pkl', 'rb') as f:
    model = pickle.load(f)

st.title("Sentiment Analyzer")
st.subheader('for financial Texts')
text_input = st.text_area("Enter the message")

if st.button('Analyze'):
    # 1. preprocess
    cleaned_text = text_cleaning(text_input)
    processed_text = Text_Processing(cleaned_text)
    # 2. vectorize
    vector_input = tfidf.transform([processed_text])
    # 3. predict (labels: 0 = negative, 1 = neutral, 2 = positive)
    result = model.predict(vector_input)[0]
    # 4. display
    if result == 0:
        st.header('Negative Statement')
    elif result == 1:
        st.header('Neutral statement')
    elif result == 2:
        st.header('Positive statement')