
Commit 6a2c668

feat(crawl_terraform): Add Crawl Terraform (#87)
* crawl(terraform): crawl terraform docs - first part
* crawl(terraform): add crawl for more info
* feat(crawl_terraform): crawl terraform docs
* feat(crawl_terraform): add readme
* feat(crawl_terraform): add default csv file
1 parent 2a365ee commit 6a2c668

4 files changed

Lines changed: 237 additions & 0 deletions

File tree

- crawl/content_parser.py
- crawl/main.py
- crawl/readme.md
- crawl/urls.csv

crawl/content_parser.py

Lines changed: 105 additions & 0 deletions
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class WebContentParser:
    """Fetch a page and extract its <main id="main"> content as a list of sections."""

    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/50.0.2661.102 Safari/537.36'
            )
        }
        self.session = self._initialize_session()
        self.main_response = None
        self.all_page_data = []

    def _initialize_session(self):
        """Set up the session with a retry strategy."""
        retry_strategy = Retry(
            total=5,
            backoff_factor=8,
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        # Ignore Retry-After headers so the backoff factor above always applies.
        adapter.max_retries.respect_retry_after_header = False

        session = requests.Session()
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        return session

    def fetch_content(self):
        """Fetch the main content from the URL."""
        try:
            # Note: verify=False disables TLS certificate verification.
            self.main_response = self.session.get(
                self.url, verify=False, timeout=30, headers=self.headers
            )
            print(f'URL fetched: {self.url}')
            return self.main_response
        except requests.RequestException as e:
            print(f"Failed to fetch the URL: {e}")
            return None

    def parse_content(self):
        """Parse the fetched HTML into one dict per heading-led section."""
        if not self.main_response:
            print("No response available to parse.")
            return []

        main_soup = BeautifulSoup(self.main_response.content, 'html.parser')
        main_element = main_soup.find('main', {'id': 'main'})
        if not main_element:
            print("No 'main' element found.")
            return []

        all_tags = main_element.find_all(['h1', 'h2', 'h3', 'p', 'blockquote', 'ul'])
        each_title_data = {}

        for tag in all_tags:
            if tag.name in ['h1', 'h2']:
                # A new top-level heading closes the current section.
                if each_title_data:
                    self.all_page_data.append(each_title_data)
                    each_title_data = {}
                each_title_data['metadata'] = tag.text.strip()

            elif tag.name == 'h3':
                if tag.text.strip() == 'Resources':
                    # Reserve a slot; the following <ul> fills it in.
                    each_title_data[tag.text.strip()] = ''
                else:
                    if each_title_data:
                        self.all_page_data.append(each_title_data)
                        each_title_data = {}
                    each_title_data['metadata'] = tag.text.strip()

            elif tag.name in ['p', 'blockquote']:
                num = len(each_title_data)
                key = f'content {num}'
                if tag.text.strip():
                    each_title_data[key] = tag.text.strip()

            elif tag.name == 'ul':
                # This class name is specific to developer.hashicorp.com markup.
                text = ' '.join(
                    li.text.strip()
                    for li in tag.find_all('li', {'class': 'mdx-lists_listItem__nkqhg'})
                )
                if 'Resources' in each_title_data:
                    each_title_data['Resources'] = text
                else:
                    num = len(each_title_data)
                    key = f'content {num}'
                    if text:
                        each_title_data[key] = text

        if each_title_data:
            self.all_page_data.append(each_title_data)

        return self.all_page_data

    def get_data(self):
        """Fetch and parse content in one call."""
        self.fetch_content()
        return self.parse_content()
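As a usage sketch, assuming `content_parser.py` is on the import path, the class above can be driven directly against the URL shipped in `crawl/urls.csv`:

```python
# Minimal sketch: crawl a single docs page and inspect the parsed sections.
from content_parser import WebContentParser

parser = WebContentParser('https://developer.hashicorp.com/terraform/docs')
sections = parser.get_data()  # fetches the page, then parses <main id="main">

for section in sections:
    # Each section dict holds a 'metadata' heading plus its content entries.
    print(section.get('metadata'))
```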

crawl/main.py

Lines changed: 92 additions & 0 deletions
import argparse
import csv
import logging

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from content_parser import WebContentParser


def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.StreamHandler()]
    )


def setup_http_session():
    retry_strategy = Retry(
        total=5,
        backoff_factor=8,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    # Ignore Retry-After headers so the backoff factor above always applies.
    adapter.max_retries.respect_retry_after_header = False
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session


def process_urls(file_path, save_result):
    http = setup_http_session()
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/50.0.2661.102 Safari/537.36'
        )
    }

    with open(file_path, 'r') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            if not row:  # Skip empty rows
                continue
            main_url = row[0]
            try:
                main_response = http.get(main_url, verify=False, timeout=30, headers=headers)
                logging.info(f'Fetched URL: {main_url}')
            except requests.RequestException as e:
                logging.error(f"Failed to fetch URL {main_url}: {e}")
                continue

            main_soup = BeautifulSoup(main_response.content, 'html.parser')
            # These class names are specific to developer.hashicorp.com markup.
            content_root = main_soup.find('div', {'class': 'marketing-content_root__DE3hU'})
            if not content_root:
                logging.error(f'No content root found on page: {main_url}')
                continue
            products = content_root.find_all('div', {'class': 'card-grid-block_root__yDdm_'})
            logging.info(f'Found {len(products)} products on page: {main_url}')

            all_data = []
            for product in products:
                # Get the product (org) title
                title = product.find('h2').text
                sub_content_link = []
                all_sub_titles = product.find_all('li')
                for res in all_sub_titles:
                    span = res.find('span', {'class': 'card-title_text__F97Wj'})
                    link = res.find('a')
                    if not span or not link:
                        continue
                    sub_part_content = {}
                    sub_part_content['main_title'] = title
                    sub_title = span.get_text()
                    sub_part_content['sub_title'] = sub_title
                    sub_title_link = 'https://developer.hashicorp.com' + link.attrs['href']
                    sub_part_content['sub_title_link'] = sub_title_link

                    # Crawl the linked page and attach its parsed sections.
                    parser = WebContentParser(sub_title_link)
                    data = parser.get_data()
                    sub_part_content['all_data_info'] = data

                    logging.info(f'Parsed content for sub-title: {sub_title}')
                    sub_content_link.append(sub_part_content)
                all_data.append(sub_content_link)

            if save_result:
                # Logic to save all_data goes here (e.g., writing to a file or database)
                logging.info(f'Saving result for: {all_data}')
            else:
                print(all_data)


def main():
    setup_logging()

    parser = argparse.ArgumentParser(description='Process URLs from a CSV file.')
    parser.add_argument('--csv_path', type=str, default='./urls.csv',
                        help='Path to the CSV file containing URLs')
    # type=bool would treat any non-empty string (including "False") as True,
    # so expose --save_result as a store_true flag instead.
    parser.add_argument('--save_result', action='store_true',
                        help='Save the scraped results instead of printing them')
    args = parser.parse_args()

    process_urls(args.csv_path, args.save_result)


if __name__ == '__main__':
    main()
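The `save_result` branch in `process_urls` is left as a placeholder. As one possible implementation (a sketch only; `save_to_json` and its output path are hypothetical, not part of this commit), the collected `all_data` could be dumped to JSON:

```python
# Hypothetical helper, not part of this commit: persist the crawled
# data as pretty-printed JSON.
import json

def save_to_json(all_data, output_path='./results.json'):
    with open(output_path, 'w') as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)

# e.g. call save_to_json(all_data) inside the `if save_result:` branch.
```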

crawl/readme.md

Lines changed: 39 additions & 0 deletions
# Documentation for Web Content Scraper

## Overview
This script scrapes data from a list of URLs provided in a CSV file. It fetches each page, extracts product information from it, crawls the linked documentation pages, and logs every operation. The extracted content can optionally be saved. The script uses `requests` (with a retry strategy) for HTTP, `BeautifulSoup` for HTML parsing, and `argparse` for the command-line interface.

## Prerequisites
Make sure the following Python packages are installed:
- `requests`
- `beautifulsoup4`
- `urllib3` (installed automatically as a dependency of `requests`)

To install the dependencies, run the following command:
```sh
pip install requests beautifulsoup4
```

## How to Use
### Arguments
The script accepts command-line arguments that allow customization of behavior:
- `--csv_path`: The path to the CSV file containing URLs to scrape. The default value is `./urls.csv`.
- `--save_result`: A flag indicating that the scraped results should be saved instead of printed. Off by default.

## Running the Script
You can run the script using the following command:

```sh
python main.py --csv_path <path_to_csv> [--save_result]
```

For example:
```sh
python main.py --csv_path ./urls.csv --save_result
```

## CSV File Format
The CSV file should contain a list of URLs, one per line. For example:
```
https://example.com/page1
https://example.com/page2
```

crawl/urls.csv

Lines changed: 1 addition & 0 deletions
https://developer.hashicorp.com/terraform/docs
