21.2. Python的网络请求

21.2.1. urllib

Python 的 urllib 库包含如下 4 个模块。

urllib.request 模块:用于打开和读写URL 资源。
urllib.error 模块:包含了由urllib.request 引发的异常。
urllib.parse 模块:用于解析URL 。
urllib.robotparser 模块:分析robots.txt 文件

在访问互联网资源时主要使用的模块是urllib.request 、urllib.parse 和urllib.error ,
其中最核心的是urllib.request 模块
在urllib.request 模块中访问互联网资源
主要使用urllib.request.urlopen() 函数和
urllib.request.Request 对象,
urllib.request.urlopen() 函数可以用于简单的网络资源访问,
而urllib.request.Request 对象可以访问复杂网络资源。

urllib.request.urlretrieve()函数可以把对应的文件下载到本地

下载文件

import urllib.request

# urlretrieve() returns a (local_filename, headers) tuple; unpack it so that
# "filename" really holds the path of the saved file instead of the whole tuple.
filename, headers = urllib.request.urlretrieve("https://mbd.baidu.com", filename="test.html")

python2和python3的Urllib库的区别:

python2X  中 import urlparse ------ python3X 使用 import urllib.parse
python2X  中 import urllib2 ------ python3X 使用 import urllib.request,urllib.error
python2X  中 import urllib ------ python3X 使用 import urllib.request,urllib.error,urllib.parse
python2X  中 import urllib2.urlopen ------ python3X 使用 import urllib.request.urlopen
python2X  中 urllib.quote ------ python3X 使用 urllib.parse.quote
python2X  中 urllib2.Request ------ python3X 使用 import urllib.request.Request
python2X  中 cookielib.CookieJar ------ python3X 使用 http.cookiejar.CookieJar
python2X  中 urllib.urlencode ------ python3X 使用 urllib.parse.urlencode

python2和python3的urllib区别

#!/usr/bin/env python
#-*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/19 15:03
# filename: urllib-urlopen函数.py

"""
#高级用法
import urllib.request

response = urllib.request.urlopen("http://www.python.org")
print(response.read().decode('utf-8'))
"""

# Python 3
import urllib.request

# Use the response as a context manager so the connection is released even
# if one of the prints below raises.
with urllib.request.urlopen("http://www.python.org") as response:
    print(response.status)                   # HTTP status code, e.g. 200
    print(response.getheaders())             # list of (name, value) header tuples
    print(response.getheader('Server'))      # value of the Server header, e.g. nginx



# Python 2 equivalent, kept inside a string so it is not executed on Python 3.
# urllib2 response objects have no .status / .getheaders() / .getheader();
# the status comes from getcode() and the headers from info().
"""
import urllib2
response = urllib2.urlopen("http://www.python.org")
print(response.getcode())                      #200
print(response.info().items())
print(response.info().getheader('Server'))     #nginx

"""

代码示例:

# Import urllib
import urllib.request

# Open the URL (data=None means GET, timeout is 2 seconds). The context
# manager closes the connection even if reading or decoding fails.
with urllib.request.urlopen('https://movie.douban.com/', None, 2) as response:
    # Read the body and decode it to text
    html = response.read().decode('utf8')

# Write the page to a text file; "with" guarantees the file is closed
with open('code1.txt', 'w', encoding='utf8') as f:
    f.write(html)
#!/usr/bin/env python
#-*- coding:utf8 -*-
# auther; 18793
# Date:2019/5/22 14:03
# filename: urllib模块.py
import urllib.request

# Open the site with urlopen(); the with-block releases the connection for us.
with urllib.request.urlopen("http://www.sina.com.cn/") as response:
    # read() yields bytes; decode() (default utf-8) turns them into str
    html = response.read().decode()
    print(html)

urllib.request.urlopen()函数可以很轻松地打开一个网站,读取网页信息。

from urllib import request

# urlopen() accepts either a Request object or a plain URL string.
req = request.Request("http://fanyi.baidu.com")
response = request.urlopen(req)
html = response.read().decode("utf-8")
print(html)



if __name__ == '__main__':
    # Same page again, this time passing the URL string directly
    res = request.urlopen("http://fanyi.baidu.com")
    html = res.read().decode("utf-8")
    print(html)

获取服务响应信息

发送get请求

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/7/24 14:17
# filename: 获取服务器的响应信息.py
from urllib import request

f = request.urlopen("https://fanyi.baidu.com/")
data = f.read()

# HTTP status code and the reason phrase sent with it
print("Status:", f.status, f.reason)

# Response headers, printed one "name:value" pair per line
for header in f.getheaders():
    print("{}:{}".format(*header))
#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/7/24 14:23
# filename: urllib中Response对象的一些方法.py
from urllib import request

req = request.Request("https://fanyi.baidu.com/")
response = request.urlopen(req)
separator = "*" * 100

# URL of the resource that was actually retrieved
print("geturl打印信息:{}".format(response.geturl()))
print(separator)
# Response meta-information (the headers)
print("info打印信息;{}".format(response.info()))
print(separator)

# HTTP status code of the response
print("getcode打印信息:{}".format(response.getcode()))

获得静态数据

#!/usr/bin/env python
#-*- coding:utf8 -*-
# auther; 18793
# Date:2019/5/22 16:27
# filename: 获得静态数据.py
import urllib.request

url = "http://q.stock.sohu.com/cn/bk_4188.shtml"
req = urllib.request.Request(url)

with urllib.request.urlopen(req) as response:
    # The body is decoded as GBK, not the utf-8 default
    htmlstr = response.read().decode('gbk')
    print(htmlstr)

发送post网络请求

import urllib.request
import urllib.parse  # 导入解析模块

# Build the form body: urlencode() gives "word=hello", which must be sent as bytes
form_fields = {"word": "hello"}
data = urllib.parse.urlencode(form_fields).encode("utf-8")

# Passing data= makes urlopen() send a POST request
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
html = response.read()
print(html)

21.2.2. Downloader下载图片

下载并保存到文件的完整过程如下:

1.调用 requests.get()下载该文件。
2.用'wb'调用 open(),以写二进制的方式打开一个新文件。
3.利用 Response 对象的 iter_content()方法做循环。
4.在每次迭代中调用 write(),将内容写入该文件。
5.调用 close()关闭该文件。

使用requests下载图片

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2020/3/22 22:46
# filename: web下载.py
import requests

# Method 1: stream the body to disk chunk by chunk.
# The request itself is inside the try-block so connection failures are
# reported the same way as HTTP errors; stream=True avoids loading the whole
# image into memory before iter_content() runs, and the timeout keeps a dead
# server from blocking forever.
try:
    with requests.get("https://cdn.fuhao321.com/uploads/1811/1-1Q12G04221108.jpg",
                      stream=True, timeout=10) as response:
        response.raise_for_status()
        with open("image.jpg", "wb") as f:
            for chunk in response.iter_content(100000):
                f.write(chunk)
except (requests.exceptions.RequestException, OSError) as e:
    print(e)


# Method 2: check the status code and write the whole body at once.
# if response.status_code == 200:
#     code_image = response.content
#     with open("image.jpg", "wb") as f:
#         f.write(code_image)

使用urllib下载图片

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/5/22 14:08
# filename: urlib模块2.py
import urllib.parse
import urllib.request

url = "http://www.cncrk.com/up/1801/201801051228459384.jpg"
with urllib.request.urlopen(url) as response:
    data = response.read()

# The resource is a JPEG, so save it with a matching extension
# (it used to be written to "download.png").
f_name = "download.jpg"
with open(f_name, "wb") as f:
    f.write(data)
print("下载文件成功....")
def writeImage(link):
    """Download the image at ``link`` and save it in the current directory.

    The file name is the last 10 characters of the link.

    :param link: URL of the image
    :return: None
    """
    # Local import keeps the function usable on its own; this is the
    # Python 3 replacement for the Python 2 ``urllib2`` module.
    import urllib.request

    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = urllib.request.Request(link, headers=headers)
    # Raw image bytes; the context manager closes the connection afterwards
    with urllib.request.urlopen(request) as response:
        image = response.read()
    # Use the last 10 characters of the link as the file name
    filename = link[-10:]
    with open(filename, "wb") as f:
        f.write(image)
    print("已经成功下载 " + filename)

21.2.3. 总结下载图片的几种方式

# 方式1
# filename: 01.图片文件下载到本地-方式1.py
from urllib import request

# Download the picture straight to cat.jpg in the current directory
cat_url = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1565931658646&di=7481c26c3e7334361c58239fdba0b7d3&imgtype=0&src=http%3A%2F%2Fdownhdlogo.yy.com%2Fhdlogo%2F640640%2F640%2F640%2F45%2F1662457261%2Fu1662457261wYlxdOY.jpeg"
request.urlretrieve(cat_url, "cat.jpg")

函数原型如下:


urlretrieve(url, filename=None, reporthook=None, data=None)

以天堂图片网为例(http://www.ivsky.com/tupian/ziranfengguang/ ),提取当前网址中的图片链接,并将图片下载到当前目录下。代码如下:

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2020/5/6 13:19
# filename: sample01.py
import os
from urllib import request
from lxml import etree
import requests

# Download directory: "<directory of this script>/photo"
_script_dir = os.path.dirname(os.path.abspath(__file__))
Base_path = _script_dir + "/photo"


def mkdir_dir(pathname):
    """Create directory ``pathname`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is not an error (no check-then-create race).

    :param pathname: directory path to create
    :return: None
    """
    os.makedirs(pathname, exist_ok=True)


def Schedule(blocknum, blocksize, totalsize):
    """Print download progress; used as the ``reporthook`` of ``urlretrieve``.

    :param blocknum: number of blocks transferred so far
    :param blocksize: size of one block in bytes
    :param totalsize: size of the remote file in bytes; may be 0 for an empty
        file, or negative when the server does not report a size
    :return: None
    """
    if totalsize < 0:
        # Size unknown: a percentage cannot be computed, so report bytes instead
        print('当前下载进度:未知(已下载%d字节)' % (blocknum * blocksize))
        return
    if totalsize == 0:
        # Empty file: nothing left to download (and avoids dividing by zero)
        per = 100
    else:
        # The last block usually overshoots the file size, so cap at 100
        per = min(100.0 * blocknum * blocksize / totalsize, 100)
    print('当前下载进度:%d' % per)


user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://www.ivsky.com/tupian/ziranfengguang/', headers=headers)
# Parse the page with lxml and collect the src attribute of every <img>
html = etree.HTML(r.text)
img_urls = html.xpath('.// img/@src')

mkdir_dir(Base_path)

# Number the files img0.jpg, img1.jpg, ... in page order
for i, img_url in enumerate(img_urls):
    # The src values are used as protocol-relative URLs, hence the scheme prefix
    images = "https:" + img_url
    image_name = 'img' + str(i) + '.jpg'
    target_path = Base_path + "/" + image_name
    request.urlretrieve(images, target_path, Schedule)
# 方式2
# filename: 02.图片下载到本地-方式2.py
from urllib import request
import urllib

url = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1565931658646&di=7481c26c3e7334361c58239fdba0b7d3&imgtype=0&src=http%3A%2F%2Fdownhdlogo.yy.com%2Fhdlogo%2F640640%2F640%2F640%2F45%2F1662457261%2Fu1662457261wYlxdOY.jpeg"
# Wrap the URL in a Request object (no custom headers are set here)
url1 = urllib.request.Request(url)

# Fetch the resource; read() returns the raw bytes of the image
reply = urllib.request.urlopen(url1)
page = reply.read()

# Save the bytes with a plain write()
with open("cat2.jpg", mode='wb') as f:
    f.write(page)
# 方式3
#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/5/22 14:08
# filename: urlib模块2.py
import urllib.parse
import urllib.request

url = "http://www.cncrk.com/up/1801/201801051228459384.jpg"

with urllib.request.urlopen(url) as response:
    data = response.read()

# The resource is a JPEG, so save it with a matching extension
# (it used to be written to "download.png").
f_name = "download.jpg"
with open(f_name, "wb") as f:
    f.write(data)
print("下载文件成功....")
# photo_url, header and file_name are expected to be defined by the surrounding code.
# The "http:" prefix is prepended because photo_url is used without a scheme.
image_url = "http:" + photo_url
data = requests.get(image_url, headers=header)
with open(file_name, "wb") as f:
    # Write the image bytes to the file
    f.write(data.content)


或者

# "response" is the requests Response obtained earlier by the surrounding code.
try:
    response.raise_for_status()
    with open("image.jpg", "wb") as f:
        # Write the body chunk by chunk (100000 bytes per chunk)
        f.writelines(response.iter_content(100000))
except Exception as e:
    print(e)

21.2.4. urllib3

import urllib3  # third-party HTTP client (not part of the standard library)

# A PoolManager object is the entry point for sending requests with urllib3
http = urllib3.PoolManager()

# Send the request
# response = http.request("GET", "http://www.baidu.com")
form_fields = {"word": "hello"}
response = http.request("POST", "http://httpbin.org/post", fields=form_fields)

print(response.data.decode())

21.2.5. requests

下载网页使用requests.get()方法

import requests

url = "http://www.deepstone.com.tw/"
htmlfile = requests.get(url)
download_ok = htmlfile.status_code == requests.codes.ok
# Report whether the page was fetched successfully
print("取得网页内容成功" if download_ok else "取得网页内容失败")

page_text = htmlfile.text
print("网页内容大小:{}".format(len(page_text)))       # e.g. 26649
print("\n")
print("网页源码内容如下:")
print(page_text)

认识Response对象

# Comparison shown for illustration only; its result is not used
htmlfile.status_code == requests.codes.ok
page_text = htmlfile.text
print("网页内容大小:{}".format(len(page_text)))       # e.g. 26649
print("\n")
print("网页源码内容如下:")
print(page_text)

搜索页特定内容

import re
import requests

url = "http://www.deepstone.com.tw/"
htmlfile = requests.get(url)
if htmlfile.status_code == requests.codes.ok:
    pattern = input("请输入要搜索的字符串:")
    # Method 1: plain substring test
    if pattern in htmlfile.text:
        print("搜索%s成功" % pattern)
    else:
        print("搜索%s失败" % pattern)

    # Method 2: count the occurrences with a regular expression.
    # re.escape() makes the input match literally, so characters such as
    # "(" or "*" neither raise re.error nor change what is counted.
    # findall() always returns a list (empty when nothing matches), so the
    # count can be printed directly.
    name = re.findall(re.escape(pattern), htmlfile.text)
    print("%s 出现 %d 次" % (pattern, len(name)))
else:
    print("网页下载失败")


# 请输入要搜索的字符串:上奇科技
# 搜索上奇科技成功
# 上奇科技 出现 6 次

Request伪装成浏览器

import requests

url = "http://www.deepstone.com.tw/"
# Present the request as coming from a desktop Chrome browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}

try:
    htmlfile = requests.get(url, headers=headers)
    htmlfile.raise_for_status()
except Exception as e:
    print(e)
else:
    print("下载成功,且伪装成浏览器提取网络数据")



def get_one_page(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the page text.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: the decoded body on HTTP 200; None on any other status or when
        the request fails (connection error, timeout, invalid URL, ...)
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    try:
        # Without a timeout a stalled server would block this call forever
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Only network/request failures are turned into None; programming
        # errors are no longer hidden by a blanket "except Exception"
        return None

    if response.status_code == 200:
        return response.text
    return None

存储下载网页

import requests

url = "http://www.deepstone.com.tw/files/file_pool/2/0K021134525097875375/9789865004286_300.jpg"

def get_one_page(url):
    """Download ``url`` and return the raw response body.

    :param url: address of the resource to download
    :return: the body as bytes when the status code is 200, otherwise None;
        None is also returned when any exception occurs
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    try:
        reply = requests.get(url, headers=browser_headers)
        return reply.content if reply.status_code == 200 else None
    except Exception:
        return None


# Download the image with requests and save it under its original file name
html_text = get_one_page(url)
name_url = url.split("/")[-1]
if html_text is None:
    # get_one_page() returns None on failure; writing None would raise TypeError
    print("下载失败:{}".format(url))
else:
    with open(name_url, "wb") as file_Obj:
        file_Obj.write(html_text)
import requests  # 导入网络请求模块
# Form fields
data = {"word":"hello"}
# Send the request
# response = requests.get("http://www.baidu.com/")
post_url = "http://httpbin.org/post"
response = requests.post(post_url, data=data)

print(response.status_code)
print(response.content.decode())

# # 打印状态码
# print("状态码:", response.status_code)
#
# # 打印请求地址
# print("请求地址", response.url)
#
# # 打印头部信息
# print("打印头部信息", response.headers)
# # 打印cookies信息
# print("cookie信息", response.cookies)
# # 打印文本源码
# print("打印文本源码", response.text)

# print("encoding---->", response.encoding)
# # 打印字节流源码
# print("打印字节码源码", response.content)
#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2020/5/6 10:35
# filename: sample01.py
import requests

# The simplest form: a GET request
r = requests.get('http://www.baidu.com')
print(r.content)

# A POST request is just as short
import requests

postdata = dict(key='value')
r = requests.post('https://www.douban.com/', data=postdata)
print(r.content)

# Custom request headers
import requests

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://www.baidu.com', headers=headers)
print(r.content)

# Cookie handling

# Cookies set by the response can be read from r.cookies, for example:
import requests

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://www.baidu.com', headers=headers)
# Print every cookie as "name:value"
for cookie_name in r.cookies.keys():
    cookie_value = r.cookies.get(cookie_name)
    print(cookie_name + ':' + cookie_value)

# Redirects and history
import requests

r = requests.get('http://github.com')
for field in (r.url, r.status_code, r.history):
    print(field)

# Timeout (in seconds)
requests.get('http://github.com', timeout=2)


# Proxy settings: map each URL scheme to the proxy that should handle it
import requests

proxies = {
    # was "http://0.10.1.10:3128": a leading "1" had been dropped from the address
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
requests.get("http://example.org", proxies=proxies)

21.2.6. 网络超时

import requests
#导入网络请求模块中的三种异常类
from requests.exceptions import ReadTimeout,HTTPError,RequestException

# Send the request 50 times and report how each attempt ends
for attempt in range(50):
    try:
        response = requests.get("http://www.whatismyip.com/", timeout=0.5)
    except ReadTimeout:
        # The server did not send data within the timeout
        print("timeout ")
    except HTTPError:
        print("Http error")
    except RequestException:
        # Any other requests failure
        print("requerror")
    else:
        print(response.status_code)    # status code of a completed request

21.2.7. 代理IP

国内代理服务器IP地址:

https://www.xicidaili.com/

方式一

import requests

# One proxy address per URL scheme
proxy = dict(https="124.205.155.146:9090",
             http="218.64.69.79:8080")

target_url = "http://www.mingrisoft.com/"
respones = requests.get(target_url, proxies=proxy)
print(respones.content)

方式二

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/11 21:30
# filename: 代理服务器的设置.py
import urllib.request


def use_proxy(proxy_addr, url):
    """Fetch ``url`` through the proxy ``proxy_addr`` and return the page text.

    The opener is installed globally, so later ``urllib.request.urlopen()``
    calls in the process use the same proxy.

    :param proxy_addr: proxy address as "host:port"
    :param url: page to download
    :return: the response body decoded as UTF-8
    """
    # Register the proxy for both schemes: with only "http" configured an
    # https:// URL would bypass the proxy entirely.
    proxy = urllib.request.ProxyHandler({"http": proxy_addr, "https": proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    # The context manager closes the connection once the body has been read
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    return data


proxy_addr = "222.249.238.138:8080"
target_url = "https://movie.douban.com/"
data = use_proxy(proxy_addr, target_url)
# print(len(data))
print(data)
import urllib.request
url = 'https://movie.douban.com/'
# Configure one proxy per URL scheme
proxy_handler = urllib.request.ProxyHandler({
    'http': '218.56.132.157:8080',
    'https': '183.30.197.29:9797'})
# build_opener() creates an opener that routes requests through the proxy
opener = urllib.request.build_opener(proxy_handler)
# Context managers close the connection and the file even when an error occurs
with opener.open(url) as response:
    html = response.read().decode('utf-8')
with open('code3.txt', 'w', encoding='utf8') as f:
    f.write(html)

21.2.9. 伪装成浏览器

使用headers头部信息

#!/usr/bin/env python
#-*- coding:utf8 -*-
# auther; 18793
# Date:2019/6/27 14:39
# filename: 伪装成浏览器.py


import urllib.request


url = 'http://www.ctrip.com/'

# Identify as mobile Safari on an iPhone
mobile_ua = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1'
req = urllib.request.Request(url)
req.add_header('User-Agent', mobile_ua)

with urllib.request.urlopen(req) as response:
    htmlstr = response.read().decode()
    # The word "mobile" in the page is taken as the sign of the mobile site
    if 'mobile' in htmlstr:
        print('移动版')
# 导入urllib
import urllib.request
url = 'https://movie.douban.com/'
# Custom request headers
headers = {
    # The two literals are concatenated by Python; the trailing space keeps
    # "(KHTML, like Gecko)" and "Chrome/..." from being fused together.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Referer': 'https://movie.douban.com/',
    'Connection': 'keep-alive'}
# Attach the headers to the request
req = urllib.request.Request(url, headers=headers)
# Open the request; the context manager closes the connection afterwards
with urllib.request.urlopen(req) as response:
    html = response.read().decode('utf-8')
# Write the page to a file; "with" guarantees the file is closed
with open('code2.txt', 'w', encoding='utf8') as f:
    f.write(html)

21.2.10. 获取网页源码的函数

request方式一

import requests
def get_one_page(url):
    """Download ``url`` and return its text.

    :param url: address of the page to download
    :return: the decoded body when the status code is 200, otherwise None;
        None is also returned when any exception occurs
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    try:
        reply = requests.get(url, headers=browser_headers)
        return reply.text if reply.status_code == 200 else None
    except Exception:
        return None

request方式二

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/7/8 16:36
# filename: 01.浏览器模拟-headers属性01.py
import requests

def get_url(url):
    """Fetch ``url`` with a browser-like User-Agent and print the page text.

    Prints "拒绝连接" instead when the connection cannot be established.

    :param url: address of the page to download
    :return: None
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }

    try:
        # The request must be inside the try-block: it is the call that can
        # fail, not the print below.
        res = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError:
        # requests raises its own ConnectionError class; the builtin
        # ConnectionError used before never matched it.
        print("拒绝连接")
    else:
        print(res.text)

request方式

import requests

def get_html(url, headers):
    """Return the text of ``url``, requested with ``headers`` and a 100-second timeout."""
    response = requests.get(url, headers=headers, timeout=100)
    return response.text

urllib方式三

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/9/12 22:53
# filename: example01.py
import urllib.request
import re


def getHtmlCode(url):
    """Return the source of the page at ``url`` as a string.

    :param url: address of the page
    :return: the response body decoded as UTF-8
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
    # Send a browser-like User-Agent along with the request
    req = urllib.request.Request(url, headers={"User-Agent": user_agent})
    raw = urllib.request.urlopen(req).read()  # body as bytes
    return raw.decode('UTF-8')

print(getHtmlCode("https://movie.douban.com/"))

21.2.11. 几个图片爬取的示例

下载网页图片示例 (爬取指定网页中的图片(re).py 使用正则表达式)

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/16 10:20
# filename: 03.爬取指定网页中的图片(re).py
import urllib.request
import re
from typing import Any, Union

head = "https"


def getHtmlCode(url):
    """Return the source of the page at ``url`` as a string.

    :param url: address of the page
    :return: the response body decoded as UTF-8
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}

    # Send a browser-like User-Agent along with the request
    url1 = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection once the body has been read
    with urllib.request.urlopen(url1) as response:
        page = response.read()  # body as bytes
    return page.decode('UTF-8')


def getImg(page, save_dir='D:/21-DAY-Python/21.python项目开发案例入门到实战/06.抓取百度图片/photo/'):
    """Find image URLs in ``page`` with a regular expression and download them.

    Files are saved as "<n>-<original name>", where n counts the successful
    downloads. A failed download is reported and skipped.

    :param page: HTML source to scan
    :param save_dir: directory the images are written to (created on demand)
    :return: None
    """
    # Local imports: names the snippet's own import block does not provide
    import http.client
    import os

    # Each match is a (full_url, extension) tuple because the pattern has two groups
    imgList = re.findall(r'(http:[^\s]*?(jpg|png|gif))', page, re.S)
    if imgList:
        os.makedirs(save_dir, exist_ok=True)

    x = 0
    for img_url, _extension in imgList:
        print("正在下载:{}".format(img_url))
        p_name = str(x) + "-" + img_url.split('/')[-1]
        try:
            urllib.request.urlretrieve(img_url, os.path.join(save_dir, p_name))
        except (OSError, ValueError, http.client.HTTPException) as exc:
            # Network, file-system and malformed-URL problems only; a bare
            # "except" would also swallow KeyboardInterrupt.
            print("下载失败:{} ({})".format(img_url, exc))
            continue
        x += 1


if __name__ == '__main__':
    # Fetch the listing page, then download every image found in it
    url = "http://www.xz577.com/e/cxsj/"
    getImg(getHtmlCode(url))
../../_images/re-down-photo001.png

下载网页图片示例 (爬取指定网页中的图片(Beautifulsoup).py 使用Beautifulsoup实现)

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/16 10:20
# filename: 03.爬取指定网页中的图片(Beautiful).py
import urllib.request
import re
from bs4 import BeautifulSoup

head = "https"


def getHtmlCode(url):
    """Return the source of the page at ``url`` as a string.

    :param url: address of the page
    :return: the response body decoded as UTF-8
    """
    browser_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
    # Send a browser-like User-Agent along with the request
    req = urllib.request.Request(url, headers=browser_headers)
    raw = urllib.request.urlopen(req).read()  # body as bytes
    return raw.decode('UTF-8')


def getImg(page, LocalPath):
    """Download every ``<img>`` found in ``page`` with BeautifulSoup.

    Files are saved as "<LocalPath><n>.jpg", where n counts the successful
    downloads. Tags without a src attribute are skipped; a failed download is
    reported and skipped.

    :param page: HTML source to parse
    :param LocalPath: path prefix (directory with trailing slash) for the files
    :return: None
    """
    import http.client  # local import: not provided by this snippet's imports

    soup = BeautifulSoup(page, "lxml")  # parse the page with the lxml parser
    x = 0
    for img_tag in soup.find_all('img'):
        imgUrl_info = img_tag.get('src')
        if not imgUrl_info:
            # No src attribute: nothing to download
            continue
        print("正在下载:{}".format(imgUrl_info))
        try:
            urllib.request.urlretrieve(imgUrl_info, LocalPath + "%d.jpg" % x)
        except (OSError, ValueError, http.client.HTTPException) as exc:
            # ValueError covers src values that are not absolute URLs; a bare
            # "except" would also swallow KeyboardInterrupt.
            print("下载失败:{} ({})".format(imgUrl_info, exc))
            continue
        x += 1


if __name__ == '__main__':
    # Directory prefix the numbered image files are written to
    LocalPath = "D:/21-DAY-Python/21.python项目开发案例入门到实战/06.抓取百度图片/photo/"
    url = "http://www.xz577.com/e/cxsj/"
    getImg(getHtmlCode(url), LocalPath)

使用requests库和BeautifulSoup完成图片爬取

request库使用

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/16 11:30
# filename: 04.使用request库和Beautiful完成图片爬取.py
import os
import requests
from bs4 import BeautifulSoup


def getHtmkCode(url):
    """Return the source of the page at ``url``, decoded as UTF-8.

    :param url: address of the page
    :return: the page text
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
    r = requests.get(url, headers=browser_headers)
    # Force the encoding used to decode the body
    r.encoding = 'utf-8'
    return r.text


def getImg(page, localPath, base_url="http://www.zut.edu.cn/"):
    """Download every ``<img>`` found in ``page`` with requests.

    Files are saved as "<localPath><n>.jpg", where n counts the successful
    downloads. A failed download is reported and skipped.

    :param page: HTML source
    :param localPath: directory (with trailing slash) the images are saved in
    :param base_url: URL that relative src values are resolved against
    :return: None
    """
    from urllib.parse import urljoin  # local import: stdlib URL resolution

    # Create the target directory, including missing parents
    os.makedirs(localPath, exist_ok=True)
    soup = BeautifulSoup(page, 'lxml')  # parse the page with the lxml parser
    x = 0
    for img_tag in soup.find_all('img'):
        src = img_tag.get('src')
        if not src:
            # No src attribute: nothing to download
            continue
        # urljoin() leaves absolute URLs (http and https) untouched and
        # resolves relative ones without producing doubled slashes
        img_url = urljoin(base_url, src)
        print("正在下载:{}".format(img_url))
        try:
            ir = requests.get(img_url)
            # Do not save an error page as if it were an image
            ir.raise_for_status()
            with open(localPath + "%d.jpg" % x, "wb") as f:
                f.write(ir.content)
        except (requests.exceptions.RequestException, OSError) as exc:
            print("下载失败:{} ({})".format(img_url, exc))
            continue
        x += 1


if __name__ == '__main__':
    # Directory the numbered image files are written to
    localPath = "D:/21-DAY-Python/21.python项目开发案例入门到实战/06.抓取百度图片/request_download/"
    url = "http://www.zut.edu.cn/"
    getImg(getHtmkCode(url), localPath)

下载搜狗图片示例

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/16 13:32
# filename: 05.爬取搜狗图片例子.py
import requests
import urllib.request
from bs4 import BeautifulSoup
import json
import os

"""
https://pic.sogou.com/pics?query=%D0%A3%BB%A8&mode=1&start=144&reqType=ajax&reqFrom=result&tn=0
https://pic.sogou.com/pics?query=%D0%A3%BB%A8&mode=1&start=192&reqType=ajax&reqFrom=result&tn=0
https://pic.sogou.com/pics?query=%D0%A3%BB%A8&mode=1&start=240&reqType=ajax&reqFrom=result&tn=0
https://pic.sogou.com/pics?query=%D0%A3%BB%A8&mode=1&start=288&reqType=ajax&reqFrom=result&tn=0

start=48的规律进行翻页操作
"""

image_urls = []
m = 0

def getSogouImage(length, path):
    """Download the images of one Sogou result page.

    Files are saved as "<path><m>.jpg", where ``m`` is the module-level
    counter of successful downloads. Every URL found is also appended to the
    module-level ``image_urls`` list.

    :param length: value of the ``start`` query parameter (paging offset)
    :param path: directory (with trailing slash) the images are saved in
    :return: None
    """
    import http.client  # local import: not provided by this snippet's imports

    global m

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
    imgs = requests.get(
        "https://pic.sogou.com/pics?query=%D0%A3%BB%A8&mode=1&start={}&reqType=ajax&reqFrom=result&tn=0".format(length),
        headers=headers)

    items = json.loads(imgs.text)['items']
    page_urls = [item['pic_url'] for item in items]
    # Keep the global record, but download only this page's URLs: looping
    # over image_urls would fetch every earlier page again on each call.
    image_urls.extend(page_urls)

    for img_url in page_urls:
        print("****Downloading【{0}】 --->{1}.jpg *********".format(img_url, m))
        try:
            urllib.request.urlretrieve(img_url, path + str(m) + ".jpg")
        except (OSError, ValueError, http.client.HTTPException) as exc:
            # Skip a failed image; the counter is not advanced
            print("****Download failed【{0}】: {1}".format(img_url, exc))
            continue
        m += 1

    print("*******************【Download completed page!】**********************************************************")


    # try:
    #     with urllib.request.urlopen(img_url) as response:
    #         data = response.read()
    #         f_name = path + str(m) + ".jpg"
    #         with open(f_name, "wb") as f:
    #             f.write(data)
    # except:
    #     continue
    # m += 1

    # print("Download completed page!!")


if __name__ == '__main__':

    localPath = "D:/21-DAY-Python/21.python项目开发案例入门到实战/06.抓取百度图片/ajax_download/"
    # makedirs also creates missing parent directories, which os.mkdir cannot
    os.makedirs(localPath, exist_ok=True)

    # Page through the results: the start offset advances by 48 per page
    for i in range(144, 1000, 48):
        getSogouImage(i, localPath)
../../_images/sougou_download0001.png ../../_images/download_sougou0002.png

21.2.12. 根据输入爬取百度图片

代码示例

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/26 13:20
# filename: te01.py

"""
https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=宝马&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=宝马&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=30&rn=30&gsm=5a&1566797235407=
https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=宝马&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=宝马&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=60&rn=30&gsm=5a&1566797235407=
https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=宝马&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=宝马&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=90&rn=30&gsm=5a&1566797235407=
https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=宝马&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=宝马&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=120&rn=30&gsm=5a&1566797235407=

翻页30一页
"""

import requests
import re
import json
import time
import os
from urllib import request
import urllib

Download_dir = "images"


def get_one_page(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the page text.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: the decoded body on HTTP 200; None on any other status or when
        the request fails (connection error, timeout, invalid URL, ...)
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    try:
        # Without a timeout a stalled server would block the crawl forever
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Only request failures become None; programming errors stay visible
        return None
    if response.status_code == 200:
        return response.text
    return None


# url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%AE%9D%E9%A9%AC&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=%E5%AE%9D%E9%A9%AC&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=30&rn=30&gsm=1e&1566797233720='

def search_info(url):
    """Download the images listed in one page of Baidu image-search JSON.

    Each image is saved as "<Download_dir>/<n>_<title>.jpg", where n counts
    the successful downloads of this page.

    :param url: address of the JSON result page
    :return: None
    """
    page = get_one_page(url)
    if page is None:
        # The fetch failed; json.loads(None) would raise TypeError
        print("页面获取失败:{}".format(url))
        return

    data = json.loads(page)['data']

    flag = 0
    for i in data:
        if not isinstance(i, dict):
            continue
        # .get() tolerates entries that lack one of the keys
        Title = (i.get('fromPageTitleEnc') or '').strip()
        Photo_Url = i.get('hoverURL') or ''
        if not Photo_Url:
            # Nothing to download for this entry
            continue

        # Replace characters that are not allowed in file names
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', Title)
        image_names = Download_dir + "/" + str(flag) + "_" + safe_title + ".jpg"
        try:
            with urllib.request.urlopen(urllib.request.Request(Photo_Url)) as response:
                image_bytes = response.read()
            with open(image_names, 'wb') as f:
                f.write(image_bytes)
        except (OSError, ValueError) as exc:
            # Report the failure instead of hiding it behind "except: pass"
            print("下载失败:{0}.原因:{1}".format(Photo_Url, exc))
            continue
        flag += 1
        print("正在下载:{0}.图片下载地址:{1}".format(Title, Photo_Url))


def main():
    """Ask for a search keyword and crawl the matching Baidu image pages."""
    from urllib.parse import quote  # local import: stdlib percent-encoding

    os.makedirs(Download_dir, exist_ok=True)

    url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={0}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word={1}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn={2}&rn=30&gsm=5a&1566797235407='
    input_info = input("请输入要搜索并爬取的图片:").strip()
    # Percent-encode the keyword so characters such as "&", "#" or spaces
    # cannot corrupt the query string
    keyword = quote(input_info, safe='')
    # Results are paged 30 at a time through the pn parameter
    for i in range(30, 800, 30):
        try:
            search_info(url.format(keyword, keyword, i))
        except Exception as exc:
            # Keep crawling the remaining pages, but say what went wrong
            print("抓取失败(pn={0}):{1}".format(i, exc))
        else:
            time.sleep(0.5)


# Start the crawler only when the file is run as a script, not when imported.
if __name__ == '__main__':
    main()
../../_images/baidu_paqu0001.png

示例代码

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/26 13:10
# filename: example.py
import requests
import re

# 设置默认配置
MaxSearchPage = 20  # 搜索页数
CurrentPage = 0  # 当前正在搜索的页数
DefaultPath = "pictures"  # 默认存储位置
NeedSave = 0  # 是否需要存储


def imageFiler(content):
    """Extract the image addresses of the current page with a regular expression.

    :param content: page source to scan
    :return: list of the values of every "objURL" field, in page order
    """
    obj_url_pattern = re.compile('"objURL":"(.*?)"', re.S)
    return obj_url_pattern.findall(content)


def nextSource(content):
    """Extract the address of the next result page with a regular expression.

    :param content: page source to scan
    :return: the href of the "next page" link, or None when the page has none
    """
    matches = re.findall('<div id ="page">.*<a href="(.*?)" class="n"', content, re.S)
    if not matches:
        # Last page or unexpected markup: indexing [0] would raise IndexError
        return None
    next_href = matches[0]  # renamed from "next", which shadowed the builtin
    print("-------------------------" + "http://image.baidu.com" + next_href)
    return next_href


def spidler(source):
    """Crawl one result page and follow the "next page" link while allowed.

    Prints every image address found. When the module-level ``NeedSave`` flag
    is set, each image is also downloaded into ``DefaultPath``. Paging stops
    once ``CurrentPage`` exceeds ``MaxSearchPage`` or no next link is found.

    :param source: address of the page to crawl
    :return: None
    """
    import os  # local import: not provided by this snippet's imports

    global CurrentPage

    content = requests.get(source).text  # page source
    imageArr = imageFiler(content)  # image addresses on this page
    print("Current page:" + str(CurrentPage) + "*******************")
    for imageUrl in imageArr:
        print(imageUrl)
        if not NeedSave:
            continue
        try:
            picture = requests.get(imageUrl, timeout=10)
        except requests.exceptions.RequestException:
            print("Download image error! errorUrl:" + imageUrl)
            continue

        # Build the file name from the URL by dropping "/", ":" and "?"
        file_name = imageUrl.replace('/', '').replace(':', '').replace('?', '')
        if DefaultPath:
            # Make sure the target directory exists before writing into it
            os.makedirs(DefaultPath, exist_ok=True)
        # os.path.join puts a separator between directory and file name even
        # when DefaultPath has no trailing slash
        pictureSavePath = os.path.join(DefaultPath, file_name)
        with open(pictureSavePath, 'wb') as fb:
            fb.write(picture.content)

    # Paging happens once per page, after all images were handled. Nested in
    # the loop above it would recurse once per saved image, and never at all
    # when saving is switched off.
    if CurrentPage <= MaxSearchPage:
        try:
            next_href = nextSource(content)
        except IndexError:
            # No "next page" link on this page
            next_href = None
        if next_href:
            CurrentPage += 1
            spidler("http://image.baidu.com" + next_href)


def beginSearch(page=1, save=0, savePath="prctures/"):
    """Entry point of the crawler: ask for a keyword and start crawling.

    :param page: number of pages to crawl
    :param save: 0 to only list the images, 1 to download them as well
    :param savePath: directory the images are saved in
        (NOTE(review): the default "prctures/" looks like a typo for
        "pictures/"; left unchanged so existing callers keep their directory)
    :return: None
    """
    from urllib.parse import quote  # local import: stdlib percent-encoding

    global MaxSearchPage, NeedSave, DefaultPath
    MaxSearchPage = page
    NeedSave = save  # whether images are saved
    DefaultPath = savePath  # where images are saved
    key = input("请输入you want search: ")
    StartSource = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={0}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word={1}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=30&rn=30&gsm=5a&1566797235407="
    # str.format() returns a new string; the result has to be used, otherwise
    # the literal "{0}"/"{1}" placeholders are sent to the server. The keyword
    # is percent-encoded so it cannot break the query string.
    keyword = quote(key, safe='')
    spidler(StartSource.format(keyword, keyword))


# When run as a script: crawl up to 5 pages and save the images (save=1).
if __name__ == '__main__':
    beginSearch(page=5, save=1)

21.2.13. 图片爬虫

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/9/19 15:53
# filename: 01.图片爬虫实战.py
'''
翻页信息
http://www.xz577.com/e/python/2.html
http://www.xz577.com/e/python/3.html
http://www.xz577.com/e/python/4.html
http://www.xz577.com/e/python/5.html
http://www.xz577.com/e/python/6.html

'''

import re
import urllib
from urllib import request
import os
import time


class Book_spider:
    """Scrape the xz577.com Python book listing pages.

    For every book found on a listing page the spider creates a directory
    named after the book under ``self.path``, downloads the cover image into
    it and writes a small text file with the book title and its link.
    """

    # Flags belong to re.compile(): the second positional argument of a
    # compiled pattern's findall() is a start offset, not a flags value.
    _BOOK_PATTERN = re.compile('<a href=(.*?) title=(.*?)target="_blank">.*?</a>', re.S)
    _IMAGE_PATTERN = re.compile('<img src=(.*?) alt=.*?>', re.S)
    # Characters that cannot be used in a directory/file name on common systems.
    _UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }
        self.start_page = 2  # number of the next listing page to request
        self.start = 0  # how many listing pages have been requested so far
        self.title_html = "http://www.xz577.com"  # prefix for the book links
        self.html = "http://www.xz577.com/e/python/"  # listing page URL prefix
        # basename() has no directory part, so dirname() yields "" and this
        # resolves to the current working directory.
        self.path = os.path.abspath(os.path.dirname(os.path.basename(__file__)))

    def get_books_info(self):
        """Fetch the listing page ``self.start_page`` and return its HTML.

        :return: the decoded page text, or None when the request fails
        """
        url = self.html + str(self.start_page) + ".html"
        # Count the attempt up front so main() terminates even when every
        # request fails.
        self.start += 1
        try:
            req = request.Request(url, headers=self.headers)
            with request.urlopen(req) as response:
                return response.read().decode('utf-8')
        except Exception as e:
            print("抓取失败,失败原因:{}".format(e))
            return None

    @staticmethod
    def _clean(text):
        """Drop the double quotes and surrounding whitespace captured by the patterns."""
        return str(text).replace("\"", '').strip()

    def _parse_books(self, html):
        """Extract the books of one listing page.

        :param html: page text; None or "" yields an empty list
        :return: list of ``(link, book_name, photo_link)`` tuples; anchors and
            images are paired in document order
        """
        if not html:
            return []
        anchors = self._BOOK_PATTERN.findall(html)
        images = self._IMAGE_PATTERN.findall(html)
        books = []
        for (href, title), src in zip(anchors, images):
            link = self.title_html + self._clean(href)
            books.append((link, self._clean(title), self._clean(src)))
        return books

    def get_image_down(self):
        """Download the cover and write the info file for every book on the current page."""
        html = self.get_books_info()
        print("正在获取第{}页数据".format(self.start))
        print("------------------------------------------------------------------------------------------")
        print("爬取的网站是: {}".format(self.html + str(self.start_page) + ".html"))
        print("------------------------------------------------------------------------------------------")
        print()

        for link, book_name, photo_link in self._parse_books(html):
            # One directory per book; cover and info file share the book's name.
            dir_name = self._UNSAFE_NAME_CHARS.sub("_", book_name).strip()
            if not dir_name:
                continue
            book_dir = os.path.join(self.path, dir_name)
            try:
                os.makedirs(book_dir, exist_ok=True)
            except OSError as e:
                print(e)
                continue

            file_base = os.path.join(book_dir, dir_name)
            try:
                request.urlretrieve(photo_link, file_base + '.jpg')
            except Exception as e:
                # Best effort: a missing cover must not stop the crawl.
                print(e)

            try:
                with open(file_base + '.txt', 'w', encoding="utf-8") as f:
                    f.write("书籍名称:{!r}\n下载链接地址:{!r}".format(book_name, link))
            except OSError as e:
                print(e)

    def main(self):
        """Process five listing pages, pausing one second after each."""
        while self.start < 5:
            self.get_image_down()
            self.start_page += 1
            time.sleep(1)


# Script entry point: run the book spider when the file is executed directly.
if __name__ == '__main__':
    t1 = Book_spider()
    t1.main()

21.2.14. 链接爬虫

#!/usr/bin/env python
# -*- coding:utf8 -*-
# author: 18793
# Date:2019/9/20 19:08
# filename: 02.链接爬虫2.py
import re
import urllib.request


class A:
    """Collect the distinct URLs that appear in the HTML of ``self.url``."""

    # Raw string: "\s", "\." and "\w" are regex escapes, not Python string
    # escapes. The pattern itself is unchanged.
    _LINK_PATTERN = re.compile(r'(https?://[^\s)";]+\.(\w|/)*)', re.S)

    def __init__(self):
        # (name, value) pair in the form expected by opener.addheaders.
        self.headers = ("user-agent",
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36")
        self.url = "https://blog.csdn.net"

    def get_html(self, url):
        """Download *url* with a browser user-agent and return the UTF-8 decoded body.

        :param url: address of the page to download
        :return: page content as str
        """
        # Pretend to be a browser.
        opener = urllib.request.build_opener()
        opener.addheaders = [self.headers]
        # Install the opener globally so urlopen() uses it.
        urllib.request.install_opener(opener)
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8")

    @classmethod
    def _extract_links(cls, data):
        """Return the de-duplicated pattern matches found in *data* (order not defined)."""
        return list(set(cls._LINK_PATTERN.findall(data)))

    def get_link(self):
        """Return the distinct links found on ``self.url``.

        NOTE: the pattern has two capture groups, so every item is a
        ``(url, last_matched_character)`` tuple rather than a plain string.

        :return: list of tuples, duplicates removed
        """
        return self._extract_links(self.get_html(self.url))


# Script entry point: print every item returned by A.get_link().
if __name__ == '__main__':
    h1 = A()
    links = h1.get_link()
    for link in links:
        print(link)