Python Study Diary (4)

A Youdao Translate crawler + image downloads from jandan.net (煎蛋网)

First, the main packages involved: urllib.request and urllib.parse

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
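As a warm-up, a minimal GET request with urlopen() might look like the sketch below (the URL is just a placeholder):

import urllib.request

# A plain GET request: urlopen() returns an http.client.HTTPResponse object
response = urllib.request.urlopen('http://www.example.com', timeout=10)
html = response.read().decode('utf-8')  # read() gives bytes; decode to a str
print(response.status, len(html))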

Main code

import urllib.request
import urllib.parse
import json
# For the Youdao Translate URL, the _o suffix must be removed
content=input("Enter the English text to translate: ")
url='http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
data={}
data['i']=content
data['from']='AUTO'
data['to']='AUTO'
data['smartresult']='dict'
data['client']='fanyideskweb'
data['salt']='15717288199466'
data['sign']='12fc8fd46e9075f2c78cd47c4a6dbb20'
data['ts']='1571728819946'
data['bv']='e218a051a7336600dfed880d272c7d6f'
data['doctype']='json'
data['version']='2.1'
data['keyfrom']='fanyi.web'
data['action']='FY_BY_REALTlME'
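Note: salt, sign, ts and bv are normally generated by Youdao's JavaScript for every request. My understanding is that the plain /translate endpoint (with the _o suffix removed, as the comment above says) does not verify the signature, which is why these hard-coded values still work; the /translate_o endpoint would reject them.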

Accessing as a plain program (no browser headers)

data=urllib.parse.urlencode(data).encode('utf-8')  # urlencode the form data and encode it as utf-8 bytes for the server
response=urllib.request.urlopen(url,data)  # send a POST request (data is not None) and get the response
html=response.read().decode('utf-8')  # read the body and decode it into a JSON string
target=json.loads(html)  # json.loads turns the JSON string into a dict
print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))

Changing the headers to masquerade as a human user; first, the Request() class

# Headers can also be added to an existing Request object with add_header(key, val)
class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
# head overrides the User-Agent header so the server is less likely to block us
head={}
head['User-Agent']='Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14'

req=urllib.request.Request(url,data,head)  # build a Request carrying the url, POST data and headers
response=urllib.request.urlopen(req)  # send the POST request and get the response
html=response.read().decode('utf-8')  # read the body and decode it into a JSON string
target=json.loads(html)  # json.loads turns the JSON string into a dict
print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))

Adding a delay between requests (time.sleep())

import time
while True:
    # For the Youdao Translate URL, the _o suffix must be removed
    content=input("Text to translate (enter q! to quit): ")
    if content=='q!':
        print('Program exited')
        break
    ......
    time.sleep(5)  # wait five seconds before making the next request
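Putting the pieces together, the elided body ("......") could be filled in roughly as below. This is my own assembly of the earlier snippets, not the original code, and the shortened data dict assumes the plain /translate endpoint accepts a minimal field set:

import urllib.request
import urllib.parse
import json
import time

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
head = {'User-Agent': 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14'}

while True:
    content = input("Text to translate (enter q! to quit): ")
    if content == 'q!':
        print('Program exited')
        break
    # Minimal form data; add the remaining fields from the dict above if needed
    data = {'i': content, 'from': 'AUTO', 'to': 'AUTO', 'doctype': 'json'}
    data = urllib.parse.urlencode(data).encode('utf-8')
    req = urllib.request.Request(url, data, head)
    target = json.loads(urllib.request.urlopen(req).read().decode('utf-8'))
    print("Translation result: %s" % target['translateResult'][0][0]['tgt'])
    time.sleep(5)  # wait five seconds before the next request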

Proxies (urllib.request)

1. The argument is a dict of the form {'scheme': 'proxy IP:port'}

proxy_support=urllib.request.ProxyHandler({})

2. Customize and build an opener (as I understand it, a URL opener)

opener=urllib.request.build_opener(proxy_support)

3. Install the opener

urllib.request.install_opener(opener)
#from now on urllib.request.urlopen() automatically goes through the proxy

4. Call the opener (this also works without installing it globally)

opener.open(url)

Full code:

import urllib.request
import random
url='https://www.jiumodiary.com'
# Pick a proxy IP at random; free proxies can be found at https://ip.ihuan.me/
iplist=['47.75.177.173:80','150.109.55.190','39.108.238.97']
proxy_support=urllib.request.ProxyHandler({'http':random.choice(iplist)})
# Pretend to be a normal browser
opener=urllib.request.build_opener(proxy_support)
opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')]

urllib.request.install_opener(opener)  # install the opener so urlopen() uses the proxy automatically

response=urllib.request.urlopen(url)
html=response.read().decode('utf-8')

print(html)
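One thing to watch: the dict key passed to ProxyHandler is the URL scheme, and the proxy is only applied to requests using that scheme. With only an 'http' entry, a request to an https:// URL like the one above will bypass the proxy; covering both schemes looks like this (reusing an example IP from the list above):

# Route both http and https traffic through the proxy
proxy_support = urllib.request.ProxyHandler({
    'http':  '47.75.177.173:80',
    'https': '47.75.177.173:80',
})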

Practice: downloading images from a website

import urllib.request
import os
import random

# Open the url through a proxy and return the raw response bytes
# (images are binary data, so no decoding is done here)
def url_open(url):
    #iplist=['39.137.69.6','47.110.130.152','36.25.243.51','183.146.213.157']
    #proxy_support=urllib.request.ProxyHandler({'http':random.choice(iplist)})
    proxy_support=urllib.request.ProxyHandler({'http':'39.137.69.7'})
    opener=urllib.request.build_opener(proxy_support)
    opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')]
    urllib.request.install_opener(opener)  # install the opener so urlopen() uses the proxy automatically
    response=urllib.request.urlopen(url)
    html=response.read()
    return html

# Return the identifier that jandan.net uses in its per-page URLs
def get_page(url):
    html=url_open(url).decode('utf-8')
    #a=html.find('current-comment-page')+len('current-comment-page">[')
    #b=html.find(']',a)
    a=html.find('href="//jandan.net/ooxx/')+len('href="//jandan.net/ooxx/')
    b=html.find('==',a,a+255)
    print(html[a:b])
    return html[a:b]

# Find every .jpg image address on the current page and return them as a list
def find_imgs(url):
    html=url_open(url).decode('utf-8')
    img_addrs=[]
    a=html.find('img src=')
    while a!=-1:
        b=html.find('.jpg',a,a+255)  # an image URL is normally shorter than 255 characters
        if b!=-1:
            img_addrs.append(html[a+9:b+4])
        else:
            b=a+9
        # look for the next image tag
        a=html.find('img src=',b)
    for each in img_addrs:
        print(each)
    return img_addrs

# Given a list of image addresses, download and save the images
def save_imgs(folder,img_addrs):
    for each in img_addrs:
        filename=each.split('/')[-1]  # split on '/' and take the last element of the list
        #filename=each
        with open(filename,'wb') as f:
            img=url_open('http:'+each)
            f.write(img)

def download_picture(folder='mm_picture',pages=2):  # originally pages=10
    if not os.path.exists(folder):  # create the folder on the first run
        os.mkdir(folder)
    os.chdir(folder)

    url='http://jandan.net/ooxx/'
    page_str=str(get_page(url))

    for i in range(pages):
        #page_url = url+'page-' + str(page_num) +'#comments'  # this form has some issues
        # for example:
        page_url = url + str(page_str) + '==#comments'
        print('Image page URL: '+page_url)
        #page_url = url+'MjAxOTEwMjct' + 'NA'+'==#comments'
        img_addrs = find_imgs(page_url)
        print('no problem so far')
        save_imgs(folder,img_addrs)

if __name__=='__main__':
    download_picture()
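As an aside, the string searching in find_imgs() can also be done with a regular expression. A rough sketch, reusing the url_open() helper above and assuming image tags look like <img src="//...jpg">:

import re

def find_imgs_re(url):
    # Collect every src value ending in .jpg; the pattern is a rough
    # approximation of the page markup, not an exact specification.
    html = url_open(url).decode('utf-8')
    return re.findall(r'img src="(//[^"]+?\.jpg)"', html)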

Run results (screenshots omitted)

Scraping http://www.dmoztools.net with the Scrapy framework

Generate the scrapy_test project

pip install scrapy
scrapy startproject scrapy_test

The default file structure that is generated

scrapy_test
│   scrapy.cfg
│
└── scrapy_test
    │   __init__.py
    │   items.py
    │   middlewares.py
    │   pipelines.py
    │   settings.py
    │
    ├── spiders
    │   │   __init__.py
    │   │
    │   └── __pycache__
    └── __pycache__

items.py

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()

dmoz.py

import scrapy

# A dmoz spider that subclasses scrapy.Spider; its requests go to the Scheduler and then to the Downloader
class DmozSpider(scrapy.Spider):
    name="dmoz"
    allowed_domains=['dmoz.org']  # domains the spider is allowed to crawl
    start_urls=[
        "http://www.dmoztools.net/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoztools.net/Computers/Programming/Languages/Python/Resources/"
    ]

    # parse() is the callback that runs when a response is received
    def parse(self,response):
        filename=response.url.split('/')[-2]
        with open(filename,'wb') as f:
            f.write(response.body)

In cmd, scrapy genspider <spider name> <domain> generates a spider skeleton;
scrapy genspider dmoz dmoz.org generates the corresponding file for this example.
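To run the spider, execute the crawl command from the project root (the directory containing scrapy.cfg). With the parse() above, the response bodies end up as files named Books and Resources in the current directory:

scrapy crawl dmoz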

Reading the data

Scrapy Selectors: an expression mechanism based on XPath and CSS (see the sketch after the list below)

xpath(), css(): take the corresponding expression and return the matching list of selectors

extract(): serializes the selected nodes to Unicode strings and returns them as a list

re(): applies a regular expression and returns a list of the matches
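Tying the selectors back to items.py, a parse() that yields structured DmozItem objects might look like the sketch below. It follows the standard Scrapy tutorial pattern; the XPath expressions are assumptions about the listing markup, not something verified against dmoztools.net.

import scrapy
from scrapy_test.items import DmozItem

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ['dmoztools.net']
    start_urls = [
        "http://www.dmoztools.net/Computers/Programming/Languages/Python/Books/",
    ]

    def parse(self, response):
        # Assumption: each <li> under a <ul> holds one link entry on the page
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item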