-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtf-idf
More file actions
82 lines (66 loc) · 2.88 KB
/
tf-idf
File metadata and controls
82 lines (66 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import re
class SEOTfIdfAnalyzer:
    """Rank the keywords of a target page by TF-IDF against competitor pages.

    Each page is downloaded, stripped of markup and boilerplate elements,
    reduced to lowercase ASCII words, and fed to scikit-learn's
    ``TfidfVectorizer`` (unigrams and bigrams). The highest-scoring terms of
    the target page are printed as a table.
    """

    def __init__(self):
        # Forwarded unchanged to TfidfVectorizer(stop_words=...) so common
        # English filler words do not dominate the ranking.
        self.stop_words = 'english'

    @staticmethod
    def _normalize_text(raw_text):
        """Reduce *raw_text* to lowercase ASCII words separated by single spaces.

        Apostrophes are dropped so contractions stay in one piece
        ("don't" -> "dont"). Every other run of non-letter characters becomes
        a single word boundary, so "keyword-research" or "seo/sem" are not
        fused into one bogus token. Only the letters a-z/A-Z are kept.

        Returns:
            The normalized text, or "" when no letters remain.
        """
        without_apostrophes = re.sub(r"['\u2019]", '', raw_text)
        words_only = re.sub(r'[^a-zA-Z]+', ' ', without_apostrophes)
        return words_only.strip().lower()

    def extract_clean_text(self, url: str) -> str:
        """Fetch *url* and return its visible text as normalized words.

        ``script``, ``style``, ``header``, ``footer`` and ``nav`` elements are
        removed before the text is extracted, so only the main content is
        analysed.

        Args:
            url: Address of the page to download.

        Returns:
            Lowercase, space-separated words, or "" when the page could not
            be fetched or parsed, or contains no words at all.
        """
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Remove non-content elements
            for element in soup(["script", "style", "header", "footer", "nav"]):
                element.decompose()
            return self._normalize_text(soup.get_text(separator=' '))
        except Exception as e:
            # Deliberate best-effort boundary: one unreachable or malformed
            # page is reported and skipped instead of aborting the analysis.
            print(f"Error crawling {url}: {e}")
            return ""

    def run_analysis(self, target_url: str, competitor_urls) -> None:
        """Print the top TF-IDF keywords of the target page.

        The corpus is the target page followed by every competitor page that
        could be scraped. Competitors that yield no text are skipped. If the
        target itself yields no text, the analysis stops early.

        Args:
            target_url: Page whose keywords are ranked.
            competitor_urls: Iterable of competitor page URLs used as the
                comparison corpus.
        """
        print(f"Analyzing Target: {target_url}")

        # 1. Collect corpus; the target must stay at index 0 (see step 4).
        target_content = self.extract_clean_text(target_url)
        if not target_content:
            print(f"No analyzable text found for target: {target_url}")
            return
        documents = [target_content]

        for url in competitor_urls:
            print(f"Scraping Competitor: {url}")
            content = self.extract_clean_text(url)
            if content:
                documents.append(content)

        if len(documents) == 1:
            print("Warning: no competitor content retrieved; "
                  "scores are based on the target page alone.")

        # 2. Initialize TF-IDF vectorizer (single words and two-word phrases).
        vectorizer = TfidfVectorizer(stop_words=self.stop_words, ngram_range=(1, 2))

        # 3. Fit and transform the text. scikit-learn raises ValueError when
        #    the vocabulary is empty (e.g. every document holds only stop
        #    words), which would otherwise crash the run.
        try:
            tfidf_matrix = vectorizer.fit_transform(documents)
        except ValueError as e:
            print(f"Could not compute TF-IDF: {e}")
            return
        feature_names = vectorizer.get_feature_names_out()

        # 4. Extract scores for the target page (row 0 of the matrix).
        target_vector = tfidf_matrix[0].toarray()[0]
        results = sorted(zip(feature_names, target_vector), key=lambda x: x[1], reverse=True)

        # 5. Display the top 20 keywords, omitting terms absent from the target.
        print("\n" + "="*40)
        print(f"{'Keyword/Phrase':<25} | {'TF-IDF Score':<10}")
        print("-" * 40)
        for word, score in results[:20]:
            if score > 0:
                print(f"{word:<25} | {score:.4f}")
# --- Script entry point ---
if __name__ == "__main__":
    # Placeholder URLs: substitute the page under analysis and the competitor
    # pages it should be compared against.
    target_page = "https://www.example.com/seo-guide"
    rival_pages = [
        "https://www.competitorA.com/seo-basics",
        "https://www.competitorB.com/what-is-seo",
    ]
    SEOTfIdfAnalyzer().run_analysis(target_page, rival_pages)