Skip to content

Commit 5bc2431

Browse files
committed
zhihu
1 parent cccedf4 commit 5bc2431

5 files changed

Lines changed: 20 additions & 50 deletions

File tree

PythonSpider/.spyproject/workspace.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@ save_non_project_files = False
66

77
[main]
88
version = 0.1.0
9-
recent_files = ['C:\\Users\\Administrator\\.spyder-py3\\temp.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\SimpleCrawler.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\followers.json']
9+
recent_files = ['C:\\Users\\Administrator\\.spyder-py3\\temp.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\SimpleCrawler.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\followers.json', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\bs4quickstart.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\urllib.requestDemo1.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\bs4WangYiYunToExcel2.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\bs4WangYiYunToExcel.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\bs4WangYiYun.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\bs4Meizitu.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\bs4JokeToExcel.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\usecookie.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\cookie.txt', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\requestsTuicool.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\requestsTuicool2.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\urllib.requestTuicool.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\spider\\urllib.requestTuicool2.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\zhihu.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\selenium\\firstDemo.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\selenium\\geckodriver.log', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\selenium\\xiaomiShequ.py', 'D:\\Anaconda3\\lib\\site-packages\\selenium\\webdriver\\__init__.py', 'E:\\GitHubWorkplace\\Python\\PythonSpider\\selenium\\csdn.py']
1010

PythonSpider/cookie.txt

Whitespace-only changes.

PythonSpider/selenium/zhihu.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 17 18:30:06 2018

@author: Administrator

Sign in to Zhihu through a Selenium-driven Firefox window.

The account and password are read from the ZHIHU_ACCOUNT and ZHIHU_PASSWORD
environment variables so that no credential is ever stored in the repository.
"""

import os

from selenium import webdriver
from selenium.webdriver.common.by import By

SIGNIN_URL = 'https://www.zhihu.com/signin?next=%2F'

# Absolute XPath of the sign-in form; the fields below are addressed relative
# to it so the long prefix is written only once.
FORM_XPATH = "/html/body/div[1]/div/main/div/div/div/div[2]/div[1]/form"
ACCOUNT_XPATH = FORM_XPATH + "/div[1]/div[2]/div[1]/input"
PASSWORD_XPATH = FORM_XPATH + "/div[2]/div/div[1]/input"
SUBMIT_XPATH = FORM_XPATH + "/button"

# Seconds to keep polling for an element before giving up.
ELEMENT_WAIT_SECONDS = 10


def fill_field(driver, xpath, text):
    """Clear the input located by *xpath* and type *text* into it."""
    field = driver.find_element(By.XPATH, xpath)
    field.clear()
    field.send_keys(text)


# Resolve the credentials first so a missing setting fails before a browser
# window is opened and left behind.
account = os.environ.get('ZHIHU_ACCOUNT')
password = os.environ.get('ZHIHU_PASSWORD')
if not account or not password:
    raise SystemExit(
        'Set the ZHIHU_ACCOUNT and ZHIHU_PASSWORD environment variables '
        'before running this script.'
    )

## Create the browser object
browser = webdriver.Firefox()
# The sign-in form is rendered by script after the page loads, so let element
# lookups poll for a while instead of failing immediately.
browser.implicitly_wait(ELEMENT_WAIT_SECONDS)

## Open the Zhihu sign-in page
browser.get(SIGNIN_URL)
fill_field(browser, ACCOUNT_XPATH, account)    # account (phone number)
fill_field(browser, PASSWORD_XPATH, password)  # password
browser.find_element(By.XPATH, SUBMIT_XPATH).click()  # submit the form

PythonSpider/usecookie.py

Lines changed: 0 additions & 49 deletions
This file was deleted.

PythonSpider/zhihu.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,6 @@
44
55
@author: Administrator
66
"""
7+
# Zhihu login helper and front-page scraper built on requests + BeautifulSoup.
#
# A cookie jar stored in the file "cookies" is reused between runs. When the
# saved session is still valid the front page is scraped; otherwise the user
# is asked for an account and password and a login is attempted.
import json
import os.path
import re
import time

try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib

import requests
from bs4 import BeautifulSoup

try:
    from PIL import Image
except ImportError:
    # Pillow is optional; without it the captcha image must be opened by hand.
    Image = None

try:
    input = raw_input  # Python 2: make input() return the raw string
except NameError:
    pass

# Request headers sent with every call
agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    'User-Agent': agent
}

# Reuse the login cookies saved by a previous run, if any
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
except (IOError, OSError):
    # Covers both a missing cookie file and cookielib.LoadError.
    print("Cookie 未能加载")


def get_xsrf():
    """Return the dynamic ``_xsrf`` token embedded in the Zhihu index page.

    Raises:
        IndexError: if the page contains no ``_xsrf`` form field.
    """
    index_url = 'https://www.zhihu.com'
    index_page = session.get(index_url, headers=headers)
    pattern = r'name="_xsrf" value="(.*?)"'
    tokens = re.findall(pattern, index_page.text)
    if not tokens:
        raise IndexError("_xsrf token not found in the Zhihu index page")
    return tokens[0]


def _show_captcha(path):
    """Try to display the image at *path* with Pillow; return True on success."""
    if Image is None:
        return False
    try:
        im = Image.open(path)
        im.show()
        im.close()
    except Exception:
        # Displaying the image is best-effort; the caller prints a fallback hint.
        return False
    return True


def get_captcha():
    """Download the login captcha, show it, and return the text the user types.

    The image is written to ``captcha.jpg`` in the current directory. When it
    cannot be displayed, the user is told where to find the file instead.
    """
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = session.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    if not _show_captcha('captcha.jpg'):
        print(u'请到 %s 目录找到captcha.jpg 手动输入' % os.path.abspath('captcha.jpg'))
    return input("please input the captcha\n>")


def isLogin():
    """Return True when the saved session can open the profile settings page.

    Redirects are not followed, so only a direct 200 response counts as
    being logged in.
    """
    url = "https://www.zhihu.com/settings/profile"
    status = session.get(url, headers=headers, allow_redirects=False).status_code
    return status == 200


def login(secret, account):
    """Log in with *account* (phone number or e-mail) and password *secret*.

    Returns 0 without contacting the server when *account* is neither an
    11-digit phone number starting with 1 nor a string containing "@";
    otherwise returns None after saving the session cookies.
    """
    if re.match(r"^1\d{10}$", account):
        print("手机号登录 \n")
        post_url = 'https://www.zhihu.com/login/phone_num'
        account_field = 'phone_num'
    elif "@" in account:
        print("邮箱登录 \n")
        post_url = 'https://www.zhihu.com/login/email'
        account_field = 'email'
    else:
        print("你的账号输入有问题,请重新登录")
        return 0

    postdata = {
        '_xsrf': get_xsrf(),
        'password': secret,
        'remember_me': 'true',
        account_field: account,
    }
    try:
        # First attempt: log in without a captcha
        login_page = session.post(post_url, data=postdata, headers=headers)
        print(login_page.status_code)
        print(login_page.text)
    except Exception:
        # The first attempt failed: retry with a captcha typed in by the user
        postdata["captcha"] = get_captcha()
        login_page = session.post(post_url, data=postdata, headers=headers)
        # The response body is untrusted text: parse it as JSON, never eval() it.
        try:
            print(json.loads(login_page.text)['msg'])
        except (ValueError, KeyError, TypeError):
            print(login_page.text)
    # Match the ignore_discard=True used when loading, so session cookies
    # are written out as well.
    session.cookies.save(ignore_discard=True)


def _fetch_soup(url):
    """Fetch *url* with the shared session and return the parsed document."""
    page = session.get(url, headers=headers)
    return BeautifulSoup(page.text, 'html.parser')


def getPageQuestion(url2):
    """Print the text of every question link found on the page at *url2*."""
    for tag in _fetch_soup(url2).find_all("a", class_="question_link"):
        print(tag.string)


def getPageAnswerAbstract(url2):
    """Print each answer summary on the page at *url2*, followed by its link."""
    summaries = _fetch_soup(url2).find_all('div', class_='zh-summary summary clearfix')
    for tag in summaries:
        print(tag.get_text())
        link = tag.find('a')
        if link is not None:  # a summary without a link has nothing to show
            print('詳細內容的鏈接 : ', link.get('href'))


def getPageALL(url2):
    """Print the question title of every feed item on the page at *url2*."""
    for tag in _fetch_soup(url2).find_all('div', class_='feed-content'):
        question = tag.find('a', class_='question_link')
        if question is not None:  # skip feed items that carry no question link
            print(question.get_text())


if __name__ == '__main__':
    if isLogin():
        print('您已经登录')
        url2 = 'https://www.zhihu.com'
        getPageALL(url2)
    else:
        account = input('请输入你的用户名\n> ')
        secret = input("请输入你的密码\n> ")
        login(secret, account)
8+
79

0 commit comments

Comments
 (0)