# TF-Tool/tf-idf at main · comfastshop/TF-Tool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import re

class SEOTfIdfAnalyzer:
    """Rank the keywords of a target page by TF-IDF against competitor pages.

    The target page and every reachable competitor page are downloaded,
    reduced to lowercase ASCII words, and vectorized together with
    scikit-learn's ``TfidfVectorizer`` (unigrams and bigrams). The highest
    scoring terms of the target page are printed as a table.
    """

    # Any run of characters that is not an ASCII letter (digits, punctuation,
    # whitespace, non-ASCII text) is treated as a single word separator.
    _NON_LETTERS = re.compile(r'[^a-zA-Z]+')

    def __init__(self, stop_words='english', timeout=10):
        """Configure the analyzer.

        Args:
            stop_words: Passed straight to ``TfidfVectorizer(stop_words=...)``.
                Defaults to ``'english'``.
            timeout: Timeout in seconds given to ``requests.get`` for each
                page download. Defaults to 10.
        """
        # Common English stop words to filter out "noise"
        self.stop_words = stop_words
        self.timeout = timeout

    @staticmethod
    def _normalize_text(text):
        """Reduce raw page text to lowercase ASCII words separated by one space.

        Non-letter characters are replaced by a space rather than deleted, so
        words joined only by punctuation ("seo-guide", "Hello,World") stay
        separate tokens instead of fusing into one. The result is stripped, so
        text without any letters yields ``""`` (falsy) and callers can skip it.
        """
        return SEOTfIdfAnalyzer._NON_LETTERS.sub(' ', text).strip().lower()

    def extract_clean_text(self, url):
        """Fetch a webpage and return its visible text as normalized words.

        Args:
            url: Address of the page to download.

        Returns:
            The page text lowercased and reduced to ASCII letters separated
            by single spaces, or ``""`` if the page could not be fetched or
            parsed, or contains no letters at all.
        """
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove non-content elements
            for element in soup(["script", "style", "header", "footer", "nav"]):
                element.decompose()

            # Get text and keep only alphabetic words
            return self._normalize_text(soup.get_text(separator=' '))
        except Exception as e:
            # Deliberately broad: this is the per-URL boundary, and one page
            # that fails to download or parse must not abort the whole run.
            print(f"Error crawling {url}: {e}")
            return ""

    def run_analysis(self, target_url, competitor_urls):
        """Print the top TF-IDF keywords of the target page.

        The corpus is the target page followed by every competitor page that
        yielded text; competitor pages that fail are skipped.

        Args:
            target_url: Page whose keywords are ranked.
            competitor_urls: Iterable of competitor page URLs used as the
                comparison corpus. ``None`` is treated as empty.

        Returns:
            None. Results are printed. The method returns early, after
            printing a message, when the target page yields no text or the
            corpus contains no usable terms.
        """
        print(f"Analyzing Target: {target_url}")

        # 1. Collect Corpus (Target + Competitors)
        target_content = self.extract_clean_text(target_url)
        if not target_content:
            print(f"No analyzable text found for target: {target_url}")
            return

        documents = [target_content]
        for url in competitor_urls or ():
            print(f"Scraping Competitor: {url}")
            content = self.extract_clean_text(url)
            if content:
                documents.append(content)

        if len(documents) == 1:
            # With a single document every term has the same IDF, so the
            # ranking degenerates to plain (normalized) term frequency.
            print("Warning: no competitor content available; "
                  "scores reflect term frequency in the target page only.")

        # 2. Initialize TF-IDF Vectorizer
        vectorizer = TfidfVectorizer(stop_words=self.stop_words, ngram_range=(1, 2))

        # 3. Fit and Transform the text
        try:
            tfidf_matrix = vectorizer.fit_transform(documents)
        except ValueError as e:
            # Raised by scikit-learn when the vocabulary is empty, e.g. the
            # documents contain nothing but stop words.
            print(f"TF-IDF analysis failed: {e}")
            return
        feature_names = vectorizer.get_feature_names_out()

        # 4. Extract scores for the target website (Index 0)
        target_vector = tfidf_matrix[0].toarray()[0]
        results = sorted(zip(feature_names, target_vector), key=lambda x: x[1], reverse=True)

        # 5. Display Top 20 Keywords
        print("\n" + "="*40)
        print(f"{'Keyword/Phrase':<25} | {'TF-IDF Score':<10}")
        print("-" * 40)
        for word, score in results[:20]:
            # Terms absent from the target page score 0 and are not shown.
            if score > 0:
                print(f"{word:<25} | {score:.4f}")

# --- Script entry point ---
if __name__ == "__main__":
    # Placeholder URLs: substitute the page to evaluate and the pages it
    # should be compared against.
    my_page = "https://www.example.com/seo-guide"
    competitor_pages = [
        "https://www.competitorA.com/seo-basics",
        "https://www.competitorB.com/what-is-seo",
    ]

    # Build the analyzer only once the inputs are in place, then print the report.
    analyzer = SEOTfIdfAnalyzer()
    analyzer.run_analysis(target_url=my_page, competitor_urls=competitor_pages)