21.26. 爬取酷狗TOP500的数据

网页访问URL地址:

https://www.kugou.com/yy/rank/home/2-8888.html?from=rank

通过观察可以发现，翻页是通过更换 URL 中 home/ 后面的数字来实现的：

https://www.kugou.com/yy/rank/home/1xxx
https://www.kugou.com/yy/rank/home/2xxx

21.26.1. 代码示例

#!/usr/bin/env python
#-*- coding:utf8 -*-
from bs4 import BeautifulSoup
import requests
import time             #导入相应的库文件


# Request headers passed to requests.get() by get_info(); only the
# User-Agent is set.
headers = {
    "User-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/68.0.3440.84 Safari/537.36"
    ),
}


def get_info(url):
    """Fetch one chart page and print a record for every song listed on it.

    Each record is a dict with the keys ``rank``, ``singer``, ``song`` and
    ``time``; records are printed to stdout and nothing is returned.

    :param url: address of the chart page to download.
    :raises requests.RequestException: if the page cannot be fetched
        (including when the request times out).
    """
    # Without a timeout a stalled connection would block the crawl forever.
    wb_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    durations = soup.select('span.pc_temp_tips_r > span')
    # The loop variable is named "duration" so it does not shadow the
    # imported "time" module.
    for rank, title, duration in zip(ranks, titles, durations):
        # Split on the first '-' only. Joining every later piece with ""
        # (as the original did) silently removed hyphens that belong to the
        # song title itself. Surrounding whitespace is left untouched.
        singer, _, song = title.get_text().partition('-')
        data = {
            "rank": rank.get_text().strip(),
            "singer": singer,
            "song": song,
            "time": duration.get_text().strip(),
        }
        print(data)

if __name__ == '__main__':
    # Build the URL of every chart page (page numbers 1 through 23).
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html?from=rank'.format(i) for i in range(1, 24)]

    for url in urls:
        get_info(url)
        # Pause one second between page requests. The original had this call
        # dedented out of the loop, so it ran only once, after the last page.
        time.sleep(1)

输出结果

{'rank': '1', 'singer': '展展与罗罗 ', 'time': '5:38', 'song': ' 沙漠骆驼'}
{'rank': '2', 'singer': '花姐 ', 'time': '3:10', 'song': ' 夜之光'}
{'rank': '3', 'singer': '张紫豪 ', 'time': '4:00', 'song': ' 可不可以'}
{'rank': '4', 'singer': 'G.E.M.邓紫棋 ', 'time': '3:55', 'song': ' 光年之外'}
{'rank': '5', 'singer': '李荣浩 ', 'time': '4:39', 'song': ' 年少有为'}
{'rank': '6', 'singer': '何野 ', 'time': '4:00', 'song': ' 天亮以前说再见'}
{'rank': '7', 'singer': '贺一航 ', 'time': '4:49', 'song': ' 请先说你好'}
{'rank': '8', 'singer': '王大毛 ', 'time': '4:05', 'song': ' 去年夏天'}
{'rank': '9', 'singer': '马良、孙茜茹 ', 'time': '3:56', 'song': ' 往后余生'}
{'rank': '10', 'singer': '火箭少女101 ', 'time': '3:52', 'song': ' 卡路里'}
{'rank': '11', 'singer': '周笔畅 ', 'time': '3:30', 'song': ' 最美的期待'}
{'rank': '12', 'singer': '221小伙伴 ', 'time': '3:36', 'song': ' 遥远的你 (正式版)'}
{'rank': '13', 'singer': '李袁杰 ', 'time': '3:40', 'song': ' 醉千年'}
{'rank': '14', 'singer': 'G.G(张思源 ) ', 'time': '3:16', 'song': ' 给陌生的你听'}
{'rank': '15', 'singer': '于果 ', 'time': '3:37', 'song': ' 侧脸'}
{'rank': '16', 'singer': '于文文 ', 'time': '4:42', 'song': ' 体面'}
{'rank': '17', 'singer': '胡夏、郁可唯 ', 'time': '4:36', 'song': ' 知否知否'}
{'rank': '18', 'singer': 'Ayo97、阿涵 ', 'time': '3:57', 'song': ' 感谢你曾来过'}
................

21.26.2. 爬取酷狗音乐飙升榜

#!/usr/bin/env python
# -*- coding:utf8 -*-
# author: 18793
# Date:2019/7/10 23:45
# filename: 爬取酷狗音乐飙升榜.py
import requests
from bs4 import BeautifulSoup
import time
import re
from lxml import etree

"""
使用3种方式爬取酷狗音乐的飙升榜

"""

# Chart page passed to the scraper function called in the __main__ block.
url = "https://www.kugou.com/yy/html/rank.html"


def get_html(url):
    """Download *url* and return the response body as text.

    :param url: address of the page to fetch.
    :return: the response text, or ``None`` when the request fails
        (``"request error"`` is printed in that case).
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    try:
        # The timeout stops a stalled connection from hanging the script.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only network/HTTP-layer failures are handled here; a bare
        # "except:" would also swallow KeyboardInterrupt and real bugs.
        print("request error")
        return None
    return response.text


def Re_serach(url):
    """Scrape the chart page with regular expressions and print one dict per song.

    :param url: address of the chart page.
    :return: None. Results are printed; nothing is printed when the page
        could not be downloaded.
    """
    html_info = get_html(url)
    if html_info is None:
        # get_html() has already reported the failure.
        return
    # Raw strings keep "\d" from being parsed as a (invalid) string escape.
    info1 = re.compile(r'<li class=" " title="(.*?)">')
    # "\d+" instead of "\d" so that indexes with two or more digits match too.
    music_html = re.compile(r'<a href="(.*?)" data-active="playDwn" data-index="\d+"')
    # A compiled pattern's findall() takes (string, pos, endpos), not flags.
    # The original passed re.S there, which only made scanning start at
    # offset 16. DOTALL is deliberately NOT enabled: it would allow the lazy
    # groups to run across line breaks.
    music_info_all = info1.findall(html_info)
    music_songs = music_html.findall(html_info)
    for music, music_song in zip(music_info_all, music_songs):
        # The captured text is expected to contain 'data-index="'; fall back
        # to an empty rank instead of raising IndexError when it does not.
        parts = music.split("data-index=\"")
        info = {
            "歌曲排名:": parts[1] if len(parts) > 1 else "",
            "歌名": parts[0],
            "播放链接:": music_song
        }
        print(info)


def Beautifulsoup_search(url):
    """Scrape the chart page with BeautifulSoup and print one dict per song.

    :param url: address of the chart page.
    :return: None. Results are printed; nothing is printed when the page
        could not be downloaded.
    """
    html_info = get_html(url)
    if html_info is None:
        # get_html() has already reported the failure.
        return
    soup = BeautifulSoup(html_info, "lxml")
    # Rank index, title and link are all read from the same <a> element, so
    # a single query replaces the three identical ones zipped together.
    anchors = soup.select("#rankWrap > div.pc_temp_songlist.pc_rank_songlist_short > ul > li > a")

    for anchor in anchors:
        data = {
            # data-index is used as a zero-based position, hence the + 1.
            "歌曲排行": int(anchor.get("data-index")) + 1,
            "歌曲名称": anchor.get_text().strip(),
            "歌曲链接": anchor.get("href")
        }
        print(data)


def Xapth_select(url):
    """Scrape the chart page with lxml XPath queries and print one dict per song.

    :param url: address of the chart page.
    :return: None. Results are printed; nothing is printed when the page
        could not be downloaded or is empty.
    """
    html = get_html(url)
    if not html:
        # Nothing to parse: the download failed or returned an empty body.
        return
    selector = etree.HTML(html)
    url_infos = selector.xpath('//*[@id="rankWrap"]/div[2]')
    for urlinfo in url_infos:
        song_names = urlinfo.xpath('ul/li/a/text()')
        song_links = urlinfo.xpath('ul/li/a/@href')

        # Rank is taken from list position. The original read it from the
        # pc_temp_num span text, discarded empty entries and subtracted 3.
        # NOTE(review): presumably the first three spans yield no direct
        # text, so the filtered list was shorter than the song list and
        # zip() dropped the last songs -- confirm against the live page.
        for rank, (songnam, song_link) in enumerate(zip(song_names, song_links), start=1):
            data = {
                "歌曲名称": songnam,
                "歌曲排名": rank,
                "歌曲链接": song_link
            }
            print(data)


if __name__ == '__main__':
    # Three scraper variants are defined above (regex, BeautifulSoup, XPath).
    # Only the XPath one runs; uncomment one of the others to try it instead.
    # Re_serach(url)
    # Beautifulsoup_search(url)
    Xapth_select(url)

输出信息

{'歌曲排名': 1, '歌曲名称': '郑冰冰 - 渡我不渡她', '歌曲链接': 'https://www.kugou.com/song/yc2xsff.html'}
{'歌曲排名': 2, '歌曲名称': '徐子崴 - 我想和你好好的', '歌曲链接': 'https://www.kugou.com/song/kfura8.html'}
{'歌曲排名': 3, '歌曲名称': '屈杨 - 有一种悲伤 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvg058.html'}
{'歌曲排名': 4, '歌曲名称': '由博文 - Simon (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvfjce.html'}
{'歌曲排名': 5, '歌曲名称': '肖蔷 - 无问西东 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvdx58.html'}
{'歌曲排名': 6, '歌曲名称': '陈其楠 - 未来 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvdz1e.html'}
{'歌曲排名': 7, '歌曲名称': 'CPU - 玫瑰玫瑰我爱你 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvcaa5.html'}
{'歌曲排名': 8, '歌曲名称': '潘玮柏、SeanT肖恩恩、黄旭 - 爱你3000 (Live)', '歌曲链接': 'https://www.kugou.com/song/ye02b10.html'}
{'歌曲排名': 9, '歌曲名称': '田颖 - 寂寞难耐 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydve55d.html'}
{'歌曲排名': 10, '歌曲名称': '陈小同 - 贫穷或富有 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydveq9f.html'}
{'歌曲排名': 11, '歌曲名称': 'MC Hotdog、张震岳、Creamd、Capper - 改变 (Live)', '歌曲链接': 'https://www.kugou.com/song/ye02i9e.html'}
{'歌曲排名': 12, '歌曲名称': '卓玛殷措 - 爱是怀疑 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydwo481.html'}
{'歌曲排名': 13, '歌曲名称': '孙振宇 - 对他说我愿意 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvh16d.html'}
{'歌曲排名': 14, '歌曲名称': '杨和苏KeyNG、SeanT肖恩恩 - Put it up (Live)', '歌曲链接': 'https://www.kugou.com/song/y51r65f.html'}
{'歌曲排名': 15, '歌曲名称': '豆心 - 一个', '歌曲链接': 'https://www.kugou.com/song/lru4q16.html'}
{'歌曲排名': 16, '歌曲名称': '黄旭 - 孤独 (Live)', '歌曲链接': 'https://www.kugou.com/song/ye07g2f.html'}
{'歌曲排名': 17, '歌曲名称': '新秀 - 不负 (Live)', '歌曲链接': 'https://www.kugou.com/song/ye07d94.html'}
{'歌曲排名': 18, '歌曲名称': '于果 - 鸟儿飞', '歌曲链接': 'https://www.kugou.com/song/xb34d6b.html'}
{'歌曲排名': 19, '歌曲名称': '欢子 - 保重', '歌曲链接': 'https://www.kugou.com/song/ydvipe5.html'}