Skip to content

Commit c0c16aa

Browse files
committed
Spider
微信,网易云等等爬虫案例
1 parent fc0f651 commit c0c16aa

13 files changed

Lines changed: 931 additions & 1 deletion

PythonDemo/.spyproject/workspace.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@ save_non_project_files = False
66

77
[main]
88
version = 0.1.0
9-
recent_files = ['C:\\Users\\Administrator\\.spyder-py3\\temp.py', 'C:\\Users\\Administrator\\Desktop\\PythonDemo\\test.py', 'C:\\Users\\Administrator\\Desktop\\PythonDemo\\test2.py', 'D:\\Anaconda3\\lib\\site-packages\\ncmbot\\core.py', 'D:\\Anaconda3\\lib\\site-packages\\psutil\\__init__.py']
9+
recent_files = ['E:\\GitHubWorkplace\\Python\\PythonDemo\\spider\\bs4JokeToExcel.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\spider\\bs4WangYiYunToExcel.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\saveToExcel\\xlwtDemo.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\saveToExcel\\xwltDemo2.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\spider\\bs4WangYiYunToExcel2.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\spider\\bs4quickstart.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\spider\\bs4Meizitu.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\spider\\urllib.requestDemo1.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\spider\\weixin.py', 'E:\\GitHubWorkplace\\Python\\PythonDemo\\spider\\weixinItchat.py']
1010

PythonDemo/saveToExcel/xlwtDemo.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Mar 13 14:40:09 2018
4+
5+
@author: Administrator
6+
"""
7+
8+
import xlwt
9+
10+
11+
def set_style(name, height, bold=False):
    """Build an xlwt cell style with the given font.

    Args:
        name: font face name, e.g. 'Times New Roman'.
        height: xlwt font height value (220 is used throughout this repo).
        bold: whether the font is bold.

    Returns:
        An xlwt.XFStyle carrying the configured font.
    """
    style = xlwt.XFStyle()  # initialise an empty style

    font = xlwt.Font()  # font attached to the style
    font.name = name
    font.bold = bold
    # BUG FIX: xlwt spells this attribute 'colour_index' (British spelling);
    # the original 'color_index' created an unused attribute and the colour
    # setting was silently ignored.
    font.colour_index = 4
    font.height = height

    style.font = font
    return style
22+
23+
24+
def write_excel(path='C:/Users/Administrator/Desktop/xlwtDemo.xls'):
    """Create a demo workbook with a header row and one sample data row.

    Args:
        path: destination .xls file. Defaults to the original hard-coded
            desktop location so existing callers are unaffected.
    """
    # Create the workbook and its single sheet.
    workbook = xlwt.Workbook(encoding='utf-8')
    data_sheet = workbook.add_sheet('demo')

    row0 = [u'歌单介绍', u'歌曲链接地址', '歌曲播放次数', '收藏次数', '评论次数']
    row1 = [u'测试', '15:50:33-15:52:14', '22706', 4190202, 'sss']

    # Column widths, one per column (same values as before, deduplicated).
    for col, width in enumerate((9999, 9999, 4444, 3333, 3333)):
        data_sheet.col(col).width = width

    # Write the header row (row 0) and the sample row (row 1).
    for i in range(len(row0)):
        data_sheet.write(0, i, row0[i], set_style('Times New Roman', 220, True))
        data_sheet.write(1, i, row1[i], set_style('Times New Roman', 220, True))

    # Save the file.
    workbook.save(path)
43+
44+
45+
# Script entry point: build the demo workbook and report success.
if __name__ == '__main__':
    write_excel()
    print(u'创建demo.xlsx文件成功')
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Mar 13 15:39:00 2018
4+
5+
@author: Administrator
6+
"""
7+
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Mar 13 12:58:54 2018
4+
http://blog.csdn.net/weixin_39198406/article/details/73332565
5+
@author: Administrator
6+
"""
7+
8+
#抓取糗事百科笑话的脚本
9+
import urllib.request
10+
from bs4 import BeautifulSoup
11+
import xlwt #写入文件
12+
import time
13+
14+
#返回文本式的html
15+
def getHTML(url):
    """Fetch *url* and return the raw response body as bytes."""
    # Impersonate a desktop browser so the site does not block the scraper.
    # BUG FIX: the original value started with a duplicated 'User-Agent:'
    # prefix, which sent a malformed User-Agent header value.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(req).read()
20+
21+
#返回一个bs4_url对象
22+
def creatSoup(url):
    """Download *url* and return it parsed as a BeautifulSoup (html5lib) tree."""
    return BeautifulSoup(getHTML(url), 'html5lib')
26+
27+
#新建Excel文件和其中的一个sheet,注意传的参数是字符串格式,新建完在空间中打开,直接使用write写入数据
28+
def creatExcelAndSheet(sheetName):
    """Create a new xlwt workbook containing one sheet named *sheetName*.

    Returns:
        (sheet, workbook) — write cells through the sheet, save through
        the workbook.
    """
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet(sheetName)
    return worksheet, workbook
35+
36+
#执行写入Excel的程序。参数含义 a-选择写入行,b-选择写入列,c-选择写入的内容(字符串类型)
37+
def writeToSheet(a, b, c):
    """Write value *c* into row *a*, column *b* of the module-level sheet.

    NOTE(review): depends on the global ``sheet`` created at import time
    by ``creatExcelAndSheet`` — consider passing the sheet explicitly.
    """
    sheet.write(a, b, c)
39+
40+
#抓取结束的提示信息,分别是页循环次数和内容循环次数,由于结束之前页和内容循环数还会+1.所以summary要-1
41+
def summaryAllContent(a, b, url):
    """Print the end-of-scrape summary.

    *a* and *b* are the final page and item counters; both were incremented
    once past the end of the loop, hence the ``- 1`` in the totals.
    """
    report = (
        '提示:抓取结束,无更多内容!',
        '------------------Summary------------------',
        '您抓取的网址为%s' % url,
        '共抓取 %d页 共 %d个内容' % (a - 1, b - 1),
        '-------------------------------------------',
    )
    for line in report:
        print(line)
47+
48+
#得到每一条内容的处理函数,根据不同的html需要修改
49+
def getEachContent(eachContent):
    """Extract the joke text from one content anchor element.

    Takes the first <div> inside the anchor, then the first <span> inside
    that div, and concatenates every text node it contains.
    """
    span = eachContent.select('div')[0].select('span')[0]
    return ''.join(span.strings)
56+
57+
# ---- script body: scrape qiushibaike jokes into an Excel sheet ----
sheet, file = creatExcelAndSheet('data')

i = 1  # page counter (one past the last page once the loop exits)
k = 1  # item counter, doubles as the target sheet row
while i < 2:
    # The page number is the only varying part of the listing URL.
    url = 'https://www.qiushibaike.com/8hr/page/' + str(i) + '/?s=4991834'
    soup = creatSoup(url)
    a_soup = soup.select('a[class=contentHerf]')  # all joke anchors on this page
    print('Info: 第%d页有%d个笑话' % (i, len(a_soup)))

    for eachContent in a_soup:
        writeToSheet(k, 0, k)
        writeToSheet(k, 1, getEachContent(eachContent))
        print('正在获取第%d个内容...Done' % k)
        time.sleep(0.05)  # throttle per item
        k += 1

    print('提示: 正在获取下一页内容...')
    i += 1
    time.sleep(3)  # pause between pages to stay polite

summaryAllContent(i, k, url)
file.save('C:/Users/Administrator/Desktop/糗事百科Data.xls')  # output path

PythonDemo/spider/bs4Meizitu.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Mon Mar 12 22:20:22 2018
4+
5+
@author: Administrator
6+
@description: BeautifulSoup抓取美女图片
7+
"""
8+
9+
import requests
10+
from bs4 import BeautifulSoup
11+
import os,re
12+
#导入所需要的模块
13+
class mzitu():
    """Scraper for mzitu.com: walks the archive page, then each album,
    then each image page, saving images under BASE_DIR/<album title>."""

    # Every album folder is created under this directory (was repeated
    # three times as a literal in the original).
    BASE_DIR = 'E:\\mzitu2'

    def all_url(self, url):
        """Walk the archive page *url* and scrape every album linked there."""
        html = self.request(url)
        # BUG FIX: '[0~9]' is a character class of the three literals 0, ~
        # and 9; '[0-9]' is the intended digit range.
        all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a', href=re.compile('[0-9]'))
        for a in all_a:
            title = a.get_text()
            print('------开始保存:', title)
            path = str(title).replace("?", '_')  # '?' is illegal in Windows folder names
            self.mkdir(path)  # create the album folder named after the title
            self.html(a['href'])

    def html(self, href):
        """Scrape one album page *href*: read the page count, fetch every page."""
        html = self.request(href)
        # The second-to-last pagination <span> holds the last page number.
        max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
        for page in range(1, int(max_span) + 1):
            self.img(href + '/' + str(page))

    def img(self, page_url):
        """Resolve the actual image URL found on *page_url* and save it."""
        img_html = self.request(page_url)
        img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
        self.save(img_url)

    def save(self, img_url):
        """Download *img_url* into the current working directory.

        The file name is ``img_url[-9:-4]`` — assumes URLs end in a
        5-character stem plus '.jpg' (TODO confirm against real URLs).
        """
        name = img_url[-9:-4]
        img = self.request(img_url)
        # BUG FIX: use a context manager so the handle is closed even when
        # write() raises; the original leaked the handle on error.
        with open(name + '.jpg', 'ab') as f:
            f.write(img.content)

    def mkdir(self, path):
        """Create BASE_DIR/<path> if missing and chdir into it.

        Returns:
            True when the folder was created (and is now the cwd),
            False when it already existed (cwd unchanged, as before).
        """
        path = path.strip()
        target = os.path.join(self.BASE_DIR, path)
        if not os.path.exists(target):
            print('建了一个名字叫做', path, '的文件夹!')
            os.makedirs(target)
            os.chdir(target)  # subsequent saves land in this folder
            return True
        print(path, '文件夹已经存在了!')
        return False

    def request(self, url):
        """GET *url* with browser-like headers and return the requests response."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
            'referer': "http://www.mzitu.com/100260/2"  # forged referer
        }
        return requests.get(url, headers=headers)
64+
#设置启动函数
65+
def main():
    """Entry point: build a scraper instance and crawl the full archive."""
    spider = mzitu()
    spider.all_url('http://www.mzitu.com/all')

main()

PythonDemo/spider/bs4WangYiYun.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Mar 13 12:53:20 2018
4+
5+
@author: Administrator
6+
"""
7+
8+
# 爬取网易云音乐的爬虫
9+
# -*- coding: utf-8 -*-
10+
from bs4 import BeautifulSoup
11+
import urllib.request
12+
import urllib
13+
14+
#获取网页
15+
def gethtml(url, headers=None):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Args:
        url: address to fetch.
        headers: optional dict of extra HTTP headers.

    Returns:
        The response body as str.
    """
    # BUG FIX: a mutable dict default ({}) is shared across calls; use None
    # as the sentinel — an absent value behaves identically for callers.
    req = urllib.request.Request(url, headers=headers or {})
    # Context manager guarantees the response is closed even if read/decode
    # raises (the original only closed on the success path).
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')
21+
22+
#解析音乐列表网页
23+
def parsehtmlMusicList(html):
    """Parse the playlist listing page *html* and print each playlist's
    cover image, name/URL, play count and author."""
    soup = BeautifulSoup(html, 'lxml')
    covers = soup.select('ul#m-pl-container li div img')
    links = soup.select('ul#m-pl-container li div a.msk')
    plays = soup.select('div.bottom span.nb')
    authors = soup.select('ul#m-pl-container li p a')
    # Index-driven loop (the four selections are parallel lists).
    for n in range(len(covers)):
        print('歌单图片:' + covers[n]['src'] + '\n\n')
        print('歌单名称:' + links[n]['title'] + '\n\n歌单地址:' + links[n]['href'] + '\n\n')
        print('歌单播放量:' + plays[n].text + '\n\n')
        print('歌单作者:' + authors[n]['title'] + '\n\n作者主页:' + authors[n]['href'] + '\n\n\n')
37+
38+
39+
# ---- script body: fetch the playlist listing and print every entry ----
url = 'http://music.163.com/discover/playlist'
listing_html = gethtml(url, headers={
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'music.163.com'
})
parsehtmlMusicList(listing_html)
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Tue Mar 13 15:11:32 2018
4+
5+
@author: Administrator
6+
"""
7+
8+
from bs4 import BeautifulSoup
9+
import urllib.request
10+
import urllib
11+
import xlwt
12+
13+
#获取网页
14+
def gethtml(url, headers=None):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Args:
        url: address to fetch.
        headers: optional dict of extra HTTP headers.

    Returns:
        The response body as str.
    """
    # BUG FIX: a mutable dict default ({}) is shared across calls; use None
    # as the sentinel — an absent value behaves identically for callers.
    req = urllib.request.Request(url, headers=headers or {})
    # Context manager guarantees the response is closed even if read/decode
    # raises (the original only closed on the success path).
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')
20+
def set_style(name, height, bold=False):
    """Build an xlwt cell style with the given font.

    Args:
        name: font face name, e.g. 'Times New Roman'.
        height: xlwt font height value (220 is used throughout this repo).
        bold: whether the font is bold.

    Returns:
        An xlwt.XFStyle carrying the configured font.
    """
    style = xlwt.XFStyle()  # initialise an empty style

    font = xlwt.Font()  # font attached to the style
    font.name = name
    font.bold = bold
    # BUG FIX: xlwt spells this attribute 'colour_index' (British spelling);
    # the original 'color_index' created an unused attribute and the colour
    # setting was silently ignored.
    font.colour_index = 4
    font.height = height

    style.font = font
    return style
31+
32+
#解析音乐列表网页
33+
def parsehtmlMusicList(html):
    """Parse the playlist listing *html* and save every playlist's name,
    link, play count and author to an .xls file on the desktop."""
    soup = BeautifulSoup(html, 'lxml')
    covers = soup.select('ul#m-pl-container li div img')
    links = soup.select('ul#m-pl-container li div a.msk')
    plays = soup.select('div.bottom span.nb')
    authors = soup.select('ul#m-pl-container li p a')

    # Create the workbook and its sheet.
    workbook = xlwt.Workbook(encoding='utf-8')
    data_sheet = workbook.add_sheet('demo')

    # Column widths (the fifth width is kept for parity with the original,
    # although only four columns are written).
    for col, width in enumerate((9999, 9999, 4444, 3333, 3333)):
        data_sheet.col(col).width = width

    # Header row.
    row0 = [u'歌单介绍', u'歌曲链接地址', u'歌曲播放次数', u'歌单作者']
    for col, caption in enumerate(row0):
        data_sheet.write(0, col, caption, set_style('Times New Roman', 220, True))

    # One data row per playlist; data rows start at row 1 (below the header).
    for n in range(len(covers)):
        record = [
            links[n]['title'],    # playlist description
            links[n]['href'],     # playlist link
            plays[n].text,        # play count
            authors[n]['title'],  # playlist author
        ]
        for col, value in enumerate(record):
            data_sheet.write(n + 1, col, value, set_style('Times New Roman', 220, True))

    workbook.save('C:/Users/Administrator/Desktop/xlwtDemo.xls')
69+
# ---- script body: fetch the playlist listing and export it to Excel ----
url = 'http://music.163.com/discover/playlist'
listing_html = gethtml(url, headers={
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'music.163.com'
})
parsehtmlMusicList(listing_html)
75+
76+

0 commit comments

Comments
 (0)