
Commit 6a2c668

feat(crawl_terraform): Add Crawl Terraform (#87)
* crawl(terraform): crawl terraform docs - first part
* crawl(terraform): add crawl for more info
* feat(crawl_terraform): crawl terraform docs
* feat(crawl_terraform): add readme
* feat(crawl_terraform): add default csv file
1 parent 2a365ee commit 6a2c668

4 files changed

Lines changed: 237 additions & 0 deletions

File tree

- crawl/content_parser.py
- crawl/main.py
- crawl/readme.md
- crawl/urls.csv

crawl/content_parser.py

Lines changed: 105 additions & 0 deletions
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class WebContentParser:
    """Fetch a page and extract its <main id="main"> content as a list of sections."""

    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/50.0.2661.102 Safari/537.36'
            )
        }
        self.session = self._initialize_session()
        self.main_response = None
        self.all_page_data = []

    def _initialize_session(self):
        """Set up the session with a retry strategy."""
        retry_strategy = Retry(
            total=5,
            backoff_factor=8,
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        # Ignore Retry-After headers so the backoff factor above always applies.
        adapter.max_retries.respect_retry_after_header = False

        session = requests.Session()
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        return session

    def fetch_content(self):
        """Fetch the main content from the URL."""
        try:
            # Note: verify=False disables TLS certificate verification.
            self.main_response = self.session.get(
                self.url, verify=False, timeout=30, headers=self.headers
            )
            print(f'URL fetched: {self.url}')
            return self.main_response
        except requests.RequestException as e:
            print(f"Failed to fetch the URL: {e}")
            return None

    def parse_content(self):
        """Parse the fetched HTML into one dict per heading-led section."""
        if not self.main_response:
            print("No response available to parse.")
            return []

        main_soup = BeautifulSoup(self.main_response.content, 'html.parser')
        main_element = main_soup.find('main', {'id': 'main'})
        if not main_element:
            print("No 'main' element found.")
            return []

        all_tags = main_element.find_all(['h1', 'h2', 'h3', 'p', 'blockquote', 'ul'])
        each_title_data = {}

        for tag in all_tags:
            if tag.name in ['h1', 'h2']:
                # A new top-level heading closes the current section.
                if each_title_data:
                    self.all_page_data.append(each_title_data)
                    each_title_data = {}
                each_title_data['metadata'] = tag.text.strip()

            elif tag.name == 'h3':
                if tag.text.strip() == 'Resources':
                    # Reserve a slot; the following <ul> fills it in.
                    each_title_data[tag.text.strip()] = ''
                else:
                    if each_title_data:
                        self.all_page_data.append(each_title_data)
                        each_title_data = {}
                    each_title_data['metadata'] = tag.text.strip()

            elif tag.name in ['p', 'blockquote']:
                num = len(each_title_data)
                key = f'content {num}'
                if tag.text.strip():
                    each_title_data[key] = tag.text.strip()

            elif tag.name == 'ul':
                # This class name is specific to developer.hashicorp.com markup.
                text = ' '.join(
                    li.text.strip()
                    for li in tag.find_all('li', {'class': 'mdx-lists_listItem__nkqhg'})
                )
                if 'Resources' in each_title_data:
                    each_title_data['Resources'] = text
                else:
                    num = len(each_title_data)
                    key = f'content {num}'
                    if text:
                        each_title_data[key] = text

        if each_title_data:
            self.all_page_data.append(each_title_data)

        return self.all_page_data

    def get_data(self):
        """Fetch and parse content in one call."""
        self.fetch_content()
        return self.parse_content()
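As a usage sketch, assuming `content_parser.py` is on the import path, the class above can be driven directly against the URL shipped in `crawl/urls.csv`:

```python
# Minimal sketch: crawl a single docs page and inspect the parsed sections.
from content_parser import WebContentParser

parser = WebContentParser('https://developer.hashicorp.com/terraform/docs')
sections = parser.get_data()  # fetches the page, then parses <main id="main">

for section in sections:
    # Each section dict holds a 'metadata' heading plus its content entries.
    print(section.get('metadata'))
```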

crawl/main.py

Lines changed: 92 additions & 0 deletions
import argparse
import csv
import logging

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from content_parser import WebContentParser


def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler()]
    )


def setup_http_session():
    retry_strategy = Retry(
        total=5,
        backoff_factor=8,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    # Ignore Retry-After headers so the backoff factor above always applies.
    adapter.max_retries.respect_retry_after_header = False
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session


def process_urls(file_path, save_result):
    http = setup_http_session()
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/50.0.2661.102 Safari/537.36'
        )
    }

    with open(file_path, 'r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if not row:  # Skip empty rows
                continue
            main_url = row[0]
            try:
                main_response = http.get(main_url, verify=False, timeout=30, headers=headers)
                logging.info(f'Fetched URL: {main_url}')
            except requests.RequestException as e:
                logging.error(f"Failed to fetch URL {main_url}: {e}")
                continue

            main_soup = BeautifulSoup(main_response.content, 'html.parser')
            # These class names are specific to developer.hashicorp.com markup.
            content_root = main_soup.find('div', {'class': 'marketing-content_root__DE3hU'})
            if not content_root:
                logging.error(f'No content root found on page: {main_url}')
                continue
            products = content_root.find_all('div', {'class': 'card-grid-block_root__yDdm_'})
            logging.info(f'Found {len(products)} products on page: {main_url}')

            all_data = []
            for product in products:
                # Get the product (org) title
                title = product.find('h2').text
                sub_content_link = []
                all_sub_titles = product.find_all('li')
                for res in all_sub_titles:
                    span = res.find('span', {'class': 'card-title_text__F97Wj'})
                    link = res.find('a')
                    if not span or not link:
                        continue
                    sub_part_content = {}
                    sub_part_content['main_title'] = title
                    sub_title = span.get_text()
                    sub_part_content['sub_title'] = sub_title
                    sub_title_link = 'https://developer.hashicorp.com' + link.attrs['href']
                    sub_part_content['sub_title_link'] = sub_title_link

                    # Crawl the linked page and attach its parsed sections.
                    parser = WebContentParser(sub_title_link)
                    data = parser.get_data()
                    sub_part_content['all_data_info'] = data

                    logging.info(f'Parsed content for sub-title: {sub_title}')
                    sub_content_link.append(sub_part_content)
                all_data.append(sub_content_link)

            if save_result:
                # Logic to save all_data goes here (e.g., writing to a file or database)
                logging.info(f'Saving result for: {all_data}')
            else:
                print(all_data)


def main():
    setup_logging()

    parser = argparse.ArgumentParser(description='Process URLs from a CSV file.')
    parser.add_argument('--csv_path', type=str, default='./urls.csv',
                        help='Path to the CSV file containing URLs')
    # type=bool would treat any non-empty string (including "False") as True,
    # so expose --save_result as a store_true flag instead.
    parser.add_argument('--save_result', action='store_true',
                        help='Save the scraped results instead of printing them')
    args = parser.parse_args()

    process_urls(args.csv_path, args.save_result)


if __name__ == '__main__':
    main()
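The `save_result` branch in `process_urls` is left as a placeholder. As one possible implementation (a sketch only; `save_to_json` and its output path are hypothetical, not part of this commit), the collected `all_data` could be dumped to JSON:

```python
# Hypothetical helper, not part of this commit: persist the crawled
# data as pretty-printed JSON.
import json

def save_to_json(all_data, output_path='./results.json'):
    with open(output_path, 'w') as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)

# e.g. call save_to_json(all_data) inside the `if save_result:` branch.
```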

crawl/readme.md

Lines changed: 39 additions & 0 deletions
# Documentation for Web Content Scraper

## Overview
This script scrapes data from a list of URLs provided in a CSV file. It fetches each page, extracts product information from it, crawls the linked documentation pages, and logs every operation. The extracted content can optionally be saved. The script uses `requests` (with a retry strategy) for HTTP, `BeautifulSoup` for HTML parsing, and `argparse` for the command-line interface.

## Prerequisites
Make sure the following Python packages are installed:
- `requests`
- `beautifulsoup4`
- `urllib3` (installed automatically as a dependency of `requests`)

To install the dependencies, run the following command:
```sh
pip install requests beautifulsoup4
```

## How to Use
### Arguments
The script accepts command-line arguments that allow customization of behavior:
- `--csv_path`: The path to the CSV file containing URLs to scrape. The default value is `./urls.csv`.
- `--save_result`: A flag indicating that the scraped results should be saved instead of printed. Off by default.

## Running the Script
You can run the script using the following command:

```sh
python main.py --csv_path <path_to_csv> [--save_result]
```

For example:
```sh
python main.py --csv_path ./urls.csv --save_result
```

## CSV File Format
The CSV file should contain a list of URLs, one per line. For example:
```
https://example.com/page1
https://example.com/page2
```

crawl/urls.csv

Lines changed: 1 addition & 0 deletions
https://developer.hashicorp.com/terraform/docs
