#!/usr/bin/env python3
# page_id_finder_adv.py (published as crawler.py)
import re
import sys
import time
from urllib.parse import unquote, urlsplit

import requests
from bs4 import BeautifulSoup
# Selenium import (only when needed): any failure while importing the browser
# stack just marks the Selenium fallback as unavailable.
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.chrome import ChromeDriverManager
except Exception:
    SELENIUM_AVAILABLE = False
else:
    SELENIUM_AVAILABLE = True
# Default request headers: a desktop Chrome User-Agent string, assembled from
# its three space-separated product tokens.
HEADERS = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    )),
}
def debug_print(msg):
    """Print *msg* on standard output, prefixed with the ``[+]`` marker."""
    line = "[+] {}".format(msg)
    print(line)
def fetch_requests(url, timeout=10):
    """Fetch *url* with a single HTTP GET, following redirects.

    Args:
        url: Address to request.
        timeout: Passed to ``requests.get`` as its ``timeout`` argument.

    Returns:
        The response object returned by ``requests.get`` (whatever its HTTP
        status), or ``None`` if the call raised any exception.
    """
    debug_print(f"requests GET -> {url}")
    try:
        response = requests.get(
            url,
            headers=HEADERS,
            timeout=timeout,
            allow_redirects=True,
        )
    except Exception as exc:
        # Best effort: report the failure and let the caller handle None.
        debug_print(f"requests error: {exc}")
        response = None
    return response
def looks_like_html(response):
    """Return True when *response* appears to carry an HTML document.

    A response counts as HTML if its ``Content-Type`` header contains
    ``text/html``, or if the first 200 characters of its stripped body start
    with ``<!doctype html`` or contain ``<html`` (all compared
    case-insensitively). ``None`` is never HTML.

    Args:
        response: Object exposing ``headers`` (with ``.get``) and ``text``,
            or ``None``.

    Returns:
        bool: Whether the response looks like HTML.
    """
    if response is None:
        return False

    content_type = response.headers.get('Content-Type', '').lower()
    body = response.text or ""
    # Only the beginning of the body is inspected.
    head = body.strip()[:200].lower()

    return (
        'text/html' in content_type
        or head.startswith('<!doctype html')
        or '<html' in head
    )
# "page-id-<N>" / "postid-<N>" tokens in the <body> class list.
_BODY_CLASS_ID_RE = re.compile(r'(?:page-id|postid)-(\d+)')
# Element ids of the exact form "post-<N>".
_POST_ELEMENT_ID_RE = re.compile(r'post-(\d+)')
# data-* attributes whose value is taken as an ID when it is purely numeric.
_DATA_ID_ATTRS = ('data-post-id', 'data-id', 'data-postid')


def find_ids_in_html(html):
    """Collect candidate WordPress post/page IDs from an HTML document.

    Three sources are inspected:

    * ``page-id-<N>`` / ``postid-<N>`` tokens in the ``<body>`` class list;
    * the value of hidden ``<input name="comment_post_ID">`` fields;
    * elements whose ``id`` is exactly ``post-<N>``, or whose
      ``data-post-id`` / ``data-id`` / ``data-postid`` value is a plain
      number.

    The attribute scan is anchored on purpose: taking the first digit run of
    any ``id`` value (``menu-item-12``, ``h2``, ...) reports unrelated numbers
    as page IDs, and a bounded ``\\d{1,7}`` silently truncates longer numbers.

    Args:
        html: HTML markup to parse.

    Returns:
        set[str]: The distinct ID strings found (possibly empty).
    """
    ids = set()
    soup = BeautifulSoup(html, 'html.parser')

    # page-id / postid tokens inside the body class list
    body = soup.find('body')
    if body is not None:
        for css_class in body.get('class') or ():
            ids.update(_BODY_CLASS_ID_RE.findall(css_class))

    # hidden comment_post_ID field
    for field in soup.find_all('input', {'name': 'comment_post_ID'}):
        value = field.get('value')
        if value:
            ids.add(value)

    # id="post-42" or numeric data-post-id / data-id / data-postid
    for tag in soup.find_all(True):
        element_id = tag.attrs.get('id')
        if element_id is not None:
            match = _POST_ELEMENT_ID_RE.fullmatch(str(element_id).strip())
            if match:
                ids.add(match.group(1))
        for attr in _DATA_ID_ATTRS:
            raw = tag.attrs.get(attr)
            if raw is None:
                continue
            candidate = str(raw).strip()
            if candidate.isdecimal():
                ids.add(candidate)
    return ids
def try_wp_rest(url):
    """Look up the ID of the page/post at *url* through the WordPress REST API.

    The slug is the last non-empty segment of the URL *path*; query string and
    fragment are ignored. Deriving it from the raw URL is wrong: for
    ``.../about/#team`` the "slug" would be ``#team``, which is dropped on the
    wire and turns the request into an unfiltered listing whose first entry
    would be reported as the ID. The slug is sent through ``params`` so it is
    URL-encoded properly.

    Args:
        url: Absolute URL (scheme and host required) of the page to identify.

    Returns:
        The ID as a string from the first match of the ``pages`` endpoint,
        then the ``posts`` endpoint; ``None`` if the URL has no scheme/host,
        has no path segment to use as a slug, or neither endpoint yields a
        match.
    """
    try:
        parts = urlsplit(url)
    except (ValueError, TypeError, AttributeError):
        return None
    if not parts.scheme or not parts.netloc:
        return None

    segments = [segment for segment in parts.path.split('/') if segment]
    if not segments:
        # Site root: there is no slug to query for.
        return None
    # Decode once; requests re-encodes the value when building the query.
    slug = unquote(segments[-1])

    base = f"{parts.scheme}://{parts.netloc}"
    endpoints = (
        f"{base}/wp-json/wp/v2/pages",
        f"{base}/wp-json/wp/v2/posts",
    )
    for endpoint in endpoints:
        debug_print(f"Trying WP REST API -> {endpoint}?slug={slug}")
        try:
            resp = requests.get(
                endpoint,
                params={"slug": slug},
                headers=HEADERS,
                timeout=6,
            )
            if resp.status_code != 200:
                continue
            payload = resp.json()
        except Exception as exc:
            # Best effort: a failing endpoint must not abort the lookup.
            debug_print(f"WP REST error: {exc}")
            continue
        if (
            isinstance(payload, list)
            and payload
            and isinstance(payload[0], dict)
            and 'id' in payload[0]
        ):
            return str(payload[0]['id'])
    return None
def fetch_with_selenium(url, wait=2):
    """Render *url* in headless Chrome and return the resulting page source.

    Args:
        url: Address to load in the browser.
        wait: Seconds to sleep after ``driver.get`` before reading the page
            source (a simple fixed wait; increase it if needed).

    Returns:
        The page source as returned by the driver, or ``None`` when Selenium
        is unavailable or any step of launching/loading raised.
    """
    if not SELENIUM_AVAILABLE:
        debug_print("Selenium veya webdriver-manager yüklü değil.")
        return None

    debug_print("Launching headless browser (Selenium)...")
    opts = Options()
    # new headless mode
    opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1920,1080")
    # optional: set user-agent
    opts.add_argument(f"user-agent={HEADERS['User-Agent']}")

    driver = None
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=opts)
        driver.get(url)
        time.sleep(wait)  # simple fixed wait; increase if needed
        return driver.page_source
    except Exception as e:
        debug_print(f"Selenium error: {e}")
        return None
    finally:
        # Always shut the browser down, including when loading failed;
        # otherwise the Chrome process is left running.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                debug_print(f"Selenium error: {e}")
def _id_sort_key(value):
    """Sort key: numeric IDs by numeric value, anything else after them."""
    text = str(value)
    if text.isdecimal():
        return (0, int(text), text)
    return (1, 0, text)


def main():
    """Command-line entry point: print the possible page IDs for one URL.

    Expects exactly one argument (the URL); otherwise prints a usage line and
    exits with status 1. The lookup proceeds in this order:

    1. plain HTTP fetch of the URL;
    2. WordPress REST API lookup;
    3. parsing of the fetched HTML, if the response looks like HTML;
    4. Selenium rendering, only if nothing was found so far.

    The IDs found are printed in ascending numeric order (a plain string sort
    would list "10" before "9"). If nothing was found, diagnostic details
    about the HTTP response and Selenium availability are printed instead.
    """
    if len(sys.argv) != 2:
        print("Usage: python page_id_finder_adv.py <URL>")
        sys.exit(1)
    url = sys.argv[1]

    # 1) normal requests
    r = fetch_requests(url)
    if r is None:
        debug_print("No response from requests.")
    else:
        debug_print(f"HTTP {r.status_code} Content-Type: {r.headers.get('Content-Type')}")

    # check WP REST API first (fast)
    wp_id = try_wp_rest(url)
    if wp_id:
        print(f"WP REST API found ID: {wp_id}")
        # still continue to parse HTML too

    # 2) if requests returned HTML-like content, parse
    found_ids = set()
    if looks_like_html(r):
        debug_print("Response looks like HTML. Parsing...")
        found_ids |= find_ids_in_html(r.text)
    else:
        debug_print("Response does NOT look like HTML or is empty/JS-rendered.")

    # 3) if none found, try selenium (JS render)
    if not found_ids and not wp_id:
        debug_print("Trying Selenium fallback (JS-rendered pages)...")
        html2 = fetch_with_selenium(url, wait=3)
        if html2:
            found_ids |= find_ids_in_html(html2)

    # collect results
    results = set(found_ids)
    if wp_id:
        results.add(str(wp_id))

    if results:
        ordered = sorted(results, key=_id_sort_key)
        print("Found possible page IDs: " + ", ".join(ordered))
        return

    print("No page IDs found. Debug info:")
    if r is None:
        print(" - requests returned no response (timeout / network issue).")
    else:
        print(f" - HTTP status: {r.status_code}")
        print(f" - Content-Type: {r.headers.get('Content-Type')}")
        head_snip = (r.text or "")[:500].replace('\n', ' ')
        print(f" - Head snippet: {head_snip[:400]}...")
    if not SELENIUM_AVAILABLE:
        print(" - Selenium not available; install selenium + webdriver-manager to try JS-rendered pages.")
    else:
        print(" - Selenium attempted; if still nothing, page likely requires login or blocks bots.")


if __name__ == "__main__":
    main()