Contents

案例：爬取中国天气网图片
- 使用正则匹配爬取
- 使用BeautufulSoup库进行爬取

21.17. 案例：爬取中国天气网图片 ¶

21.17.1. 使用正则匹配爬取 ¶

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date：2019/6/27 14:49
# filename: 爬取中国天气网图片.py


import urllib.request

import os
import re

url = 'http://p.weather.com.cn/'


def findallimageurl(htmlstr):
    """从HTML代码中查找匹配的字符串"""

    # 定义正则表达式
    pattern = r'http://\S+(?:\.png|\.jpg)'
    return re.findall(pattern, htmlstr)


def getfilename(urlstr):
    """根据图片连接地址截取图片名"""

    pos = urlstr.rfind('/')
    return urlstr[pos + 1:]


# 分析获得的url列表
url_list = []
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
    data = response.read()
    htmlstr = data.decode()

    url_list = findallimageurl(htmlstr)

for imagesrc in url_list:
    # 根据图片地址下载
    req = urllib.request.Request(imagesrc)
    with urllib.request.urlopen(req) as response:
        data = response.read()
        # 过滤掉用小于100kb字节的图片
        if len(data) < 1024 * 100:
            continue

        # 创建download文件夹
        if not os.path.exists('download'):
            os.mkdir('download')

        # 获得图片文件名
        filename = getfilename(imagesrc)
        filename = 'download/' + filename
        # 保存图片到本地
        with open(filename, 'wb') as f:
            f.write(data)

    print('下载图片', filename)

21.17.2. 使用BeautufulSoup库进行爬取 ¶

#!/usr/bin/env python
#-*- coding:utf8 -*-
# auther; 18793
# Date：2019/6/27 14:54
# filename: 爬取中国天气图片2.py


import os
import urllib.request

from bs4 import BeautifulSoup

url = 'http://p.weather.com.cn/'


def findallimageurl(htmlstr):
    """从HTML代码中查找匹配的字符串"""

    sp = BeautifulSoup(htmlstr, 'html.parser') #html.parser html.parser
    # 返回所有的img标签对象
    imgtaglist = sp.find_all('img')

    # 从img标签对象列表中返回对应的src列表
    srclist = list(map(lambda u: u.get('src'), imgtaglist))
    # 过滤掉非.png和.jpg结尾文件src字符串
    filtered_srclist = filter(lambda u: u.lower().endswith('.png')
                                        or u.lower().endswith('.jpg'), srclist)

    return filtered_srclist


def getfilename(urlstr):
    """根据图片连接地址截取图片名"""

    pos = urlstr.rfind('/')
    return urlstr[pos + 1:]


# 分析获得的url列表
url_list = []
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
    data = response.read()
    htmlstr = data.decode()

    url_list = findallimageurl(htmlstr)

for imagesrc in url_list:
    # 根据图片地址下载
    req = urllib.request.Request(imagesrc)
    with urllib.request.urlopen(req) as response:
        data = response.read()
        # 过滤掉用小于100kb字节的图片
        if len(data) < 1024 * 100:
            continue

        # 创建download文件夹
        if not os.path.exists('download'):
            os.mkdir('download')

        # 获得图片文件名
        filename = getfilename(imagesrc)
        filename = 'download/' + filename
        # 保存图片到本地
        with open(filename, 'wb') as f:
            f.write(data)

    print('下载图片', filename)