21.8. 3种爬虫模式性能对比

21.8.1. 举例1

#!/usr/bin/env python
# -*- coding:utf8 -*-
# author: 18793
# Date:2019/7/12 16:01
# filename: 3种爬虫模式对比.py

# 爬取数据只做返回,不存储
import requests
import re
from bs4 import BeautifulSoup
from lxml import etree
import time

# Request headers sent with every download (a desktop Chrome User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}

# Listing pages 1 through 4 that each scraper is run against.
urls = ["https://www.qiushibaike.com/text/page/{}".format(page_no) for page_no in range(1, 5)]


def re_scraper(url):
    '''
    Download one listing page and extract its posts with regular expressions.

    :param url: address of the listing page to download.
    :return: dict with the keys "id", "content", "laugh" and "comment" for the
             last post matched on the page, or None when nothing matched.
    '''
    res = requests.get(url, headers=headers)
    html = res.text
    # Raw strings keep "\d" a regex escape rather than an invalid string escape.
    ids = re.findall(r"<h2>(.*?)</h2>", html, re.S)
    contents = re.findall(r'<div class="content">.*?<span>(.*?)</span>', html, re.S)
    laughs = re.findall(r'<span class="stats-vote"><i class="number">(\d+)</i> 好笑</span>', html, re.S)
    comments = re.findall(r'<i class="number">(\d+)</i> 评论', html, re.S)

    info = None  # stays None when the page yields no complete post
    for post_id, content, laugh, comment in zip(ids, contents, laughs, comments):
        info = {
            "id": post_id,
            "content": content,
            "laugh": laugh,
            # Use this post's own count; the previous code always took comments[0].
            "comment": comment
        }
    return info


def bs_scraper(url):
    '''
    Download one listing page and extract its posts with BeautifulSoup.

    :param url: address of the listing page to download.
    :return: dict with the keys 'id', 'content', 'laugh' and 'comment' for the
             last post matched on the page, or None when nothing matched.
    '''
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    ids = soup.select("a > h2")
    # A bare "div > span" also matches the vote/comment spans, which are direct
    # children of a div too; limit the query to the content container that the
    # regex scraper in this file targets (<div class="content">).
    contents = soup.select("div.content > span")
    laughs = soup.select("span.stats-vote > i")
    # "i.number" alone matches the vote counter as well. The comment counter is
    # the one wrapped in a link (span/a/i in the XPath scraper), so require the
    # <a> parent to keep the columns aligned.
    comments = soup.select("a > i.number")

    info = None  # stays None when the page yields no complete post
    for post_id, content, laugh, comment in zip(ids, contents, laughs, comments):
        info = {
            'id': post_id.get_text(),
            'content': content.get_text(),
            'laugh': laugh.get_text(),
            'comment': comment.get_text()
        }
    return info


def lxml_scraper(url):
    '''
    Download one listing page and extract its posts with lxml XPath queries.

    :param url: address of the listing page to download.
    :return: dict with the keys "id", "content", "laugh" and "comment" for the
             last well-formed post on the page, or None when there is none.
    '''
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    url_infos = selector.xpath('//div[@class="article block untagged mb15 typs_hot"]')

    info = None  # stays None when no article could be parsed
    for url_info in url_infos:
        try:
            # Each [0] raises IndexError when the node is missing, in which
            # case `info` keeps the previous article's record.
            info = {
                "id": url_info.xpath("div[1]/a[2]/h2/text()")[0],
                "content": url_info.xpath("a[1]/div/span/text()")[0],
                "laugh": url_info.xpath("div[2]/span[1]/i/text()")[0],
                "comment": url_info.xpath("div[2]/span[2]/a/i/text()")[0]
            }
        except IndexError:
            # Skip only the malformed article instead of discarding the page.
            continue
    return info


if __name__ == '__main__':
    # Run every scraper over the same pages and report its total wall time.
    scrapers = [("RE_exoressions", re_scraper), ("BeautifulSoup", bs_scraper), ("Lxml", lxml_scraper)]
    for name, scraper in scrapers:
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        start = time.perf_counter()
        for url in urls:
            scraper(url)
        elapsed = time.perf_counter() - start
        print(name, elapsed)

输出结果

C:\Users\18793\Anaconda3\python.exe D:/GitHub/爬虫学习/3.第一个爬虫程序/3种爬虫方式的性能比较.py
RE_exoressions 3.148746967315674
BeautifulSoup 3.109945297241211
Lxml 2.79829740524292

21.8.2. 举例2

re、Beautifulsoup、lxml三种方式爬取酷狗音乐飙升榜

#!/usr/bin/env python
# -*- coding:utf8 -*-
# author: 18793
# Date:2019/7/10 23:45
# filename: 爬取酷狗音乐飙升榜.py
import requests
from bs4 import BeautifulSoup
import time
import re
from lxml import etree

"""
使用3种方式爬取酷狗音乐的飙升榜

"""

# Kugou Music ranking page handed to the scraper called in the __main__ block.
url = "https://www.kugou.com/yy/html/rank.html"


def get_html(url):
    """
    Download a page and return its decoded text.

    :param url: address of the page to fetch.
    :return: the response body as str, or None when the request failed.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    try:
        # Without a timeout requests can wait forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # Catch network failures only; a bare except would also swallow
        # KeyboardInterrupt and programming errors.
        print("request error:", exc)
        return None
    return response.text


def Re_serach(url):
    """
    Scrape the Kugou ranking page with regular expressions and print each song.

    :param url: address of the ranking page.
    :return: None; one dict per song is printed.
    """
    html_info = get_html(url)
    if not html_info:
        # get_html() returns None when the download failed.
        return

    # [^"]* keeps each capture inside its own attribute value, so the title no
    # longer drags the trailing '" data-index="N' text along with it.
    song_pattern = re.compile(r'<li class=" " title="([^"]*)"[^>]*?data-index="(\d+)"')
    # \d+ (not \d) so that entries with a two-digit index are matched as well.
    link_pattern = re.compile(r'<a href="([^"]*)" data-active="playDwn" data-index="\d+"')

    # The second positional argument of a compiled pattern's findall() is a
    # start offset, not a flags value; passing re.S there skipped the first
    # 16 characters, so no extra argument is passed here.
    music_info_all = song_pattern.findall(html_info)
    music_songs = link_pattern.findall(html_info)

    for (title, index), music_song in zip(music_info_all, music_songs):
        info = {
            # data-index is treated as zero-based, as in Beautifulsoup_search.
            "歌曲排名:": int(index) + 1,
            "歌名": title,
            "播放链接:": music_song
        }
        print(info)


def Beautifulsoup_search(url):
    """
    Scrape the Kugou ranking page with BeautifulSoup and print each song.

    :param url: address of the ranking page.
    :return: None; one dict per song is printed.
    """
    html_info = get_html(url)
    if not html_info:
        # get_html() returns None when the download failed.
        return

    soup = BeautifulSoup(html_info, "lxml")
    # Rank index, title text and link are all read from the same <a> element,
    # so one query replaces the three identical ones.
    anchors = soup.select("#rankWrap > div.pc_temp_songlist.pc_rank_songlist_short > ul > li > a")

    for anchor in anchors:
        data = {
            "歌曲排行": int(anchor.get("data-index")) + 1,
            "歌曲名称": anchor.get_text().strip(),
            "歌曲链接": anchor.get("href")
        }
        print(data)


def Xapth_select(url):
    """
    Scrape the Kugou ranking page with lxml XPath queries and print each song.

    :param url: address of the ranking page.
    :return: None; one dict per song is printed.
    """
    html = get_html(url)
    if not html:
        # get_html() returns None when the download failed.
        return

    selector = etree.HTML(html)
    url_infos = selector.xpath('//*[@id="rankWrap"]/div[2]')
    for urlinfo in url_infos:
        song_names = urlinfo.xpath('ul/li/a/text()')
        raw_ranks = urlinfo.xpath('ul/li/span[3][@ class="pc_temp_num"]/text()')
        # A plain strip() removes tabs, CR/LF and spaces in one call; text
        # nodes that were whitespace only are dropped.
        ranks = [text for text in (str(raw).strip() for raw in raw_ranks) if text]
        song_links = urlinfo.xpath('ul/li/a/@href')

        for songnam, rank, song_link in zip(song_names, ranks, song_links):
            data = {
                "歌曲名称": songnam,
                # NOTE(review): the "- 3" offset is kept from the original;
                # presumably the first three rank cells have no direct text
                # node, so the list starts at 4 - confirm against the page.
                "歌曲排名": int(rank) - 3,
                "歌曲链接": song_link
            }
            print(data)


if __name__ == "__main__":
    # Only the XPath variant runs by default; call Re_serach(url) or
    # Beautifulsoup_search(url) here instead to try the other two approaches.
    Xapth_select(url)

总结

  • 正则 使用困难,性能快,无需安装(re是Python内置模块)

  • Beautifulsoup 使用简单,速度性能慢,安装简单

  • Lxml 性能快、使用简单,安装相对困难

当网页结构简单并且想要避免额外依赖的话(不需要安装库),使用正则表达式更为合适。
当需要爬取的数据量较少时,使用较慢的BeautifulSoup 也不成问题。
当数据量大,需要追求效率时,Lxml是最好的选择。
../../_images/pacong000002.png