Contents
21.26. 爬取酷狗TOP500的数据
网页访问URL地址:
https://www.kugou.com/yy/rank/home/2-8888.html?from=rank
通过观察可以发现,翻页是通过更换URL中 home/ 后面的数字实现的,例如第1页和第2页分别为:
https://www.kugou.com/yy/rank/home/1-8888.html?from=rank
https://www.kugou.com/yy/rank/home/2-8888.html?from=rank
21.26.1. 代码示例
#!/usr/bin/env python
#-*- coding:utf8 -*-
from bs4 import BeautifulSoup
import requests
import time #导入相应的库文件
# Request headers passed to requests.get(); the User-Agent string identifies
# the client as Chrome 68 on Windows.
headers = {
    "User-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/68.0.3440.84 Safari/537.36"
    ),
}
def get_info(url):
    """Fetch one chart page and print a record for every song found on it.

    Each record is a dict with the keys ``rank``, ``singer``, ``song`` and
    ``time`` (the text of the duration element for that row).

    :param url: address of the chart page to download.
    :return: None; the records are printed to stdout.
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    wb_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    durations = soup.select('span.pc_temp_tips_r > span')
    # "duration" rather than "time" so the imported time module is not shadowed.
    for rank, title, duration in zip(ranks, titles, durations):
        text = title.get_text().strip()
        # The link text is expected to look like "singer - song".  Split once,
        # preferring the spaced separator, so hyphens inside the singer or
        # song name are preserved instead of being deleted.
        separator = ' - ' if ' - ' in text else '-'
        singer, _, song = text.partition(separator)
        data = {
            "rank": rank.get_text().strip(),
            "singer": singer.strip(),
            "song": song.strip(),
            "time": duration.get_text().strip(),
        }
        print(data)
if __name__ == '__main__':
    # Pages 1 through 23 of the chart share one URL template; only the
    # number after "home/" changes.
    page_template = 'https://www.kugou.com/yy/rank/home/{}-8888.html?from=rank'
    for page in range(1, 24):
        get_info(page_template.format(str(page)))
        # Pause one second between pages.
        time.sleep(1)
输出结果
{'rank': '1', 'singer': '展展与罗罗 ', 'time': '5:38', 'song': ' 沙漠骆驼'}
{'rank': '2', 'singer': '花姐 ', 'time': '3:10', 'song': ' 夜之光'}
{'rank': '3', 'singer': '张紫豪 ', 'time': '4:00', 'song': ' 可不可以'}
{'rank': '4', 'singer': 'G.E.M.邓紫棋 ', 'time': '3:55', 'song': ' 光年之外'}
{'rank': '5', 'singer': '李荣浩 ', 'time': '4:39', 'song': ' 年少有为'}
{'rank': '6', 'singer': '何野 ', 'time': '4:00', 'song': ' 天亮以前说再见'}
{'rank': '7', 'singer': '贺一航 ', 'time': '4:49', 'song': ' 请先说你好'}
{'rank': '8', 'singer': '王大毛 ', 'time': '4:05', 'song': ' 去年夏天'}
{'rank': '9', 'singer': '马良、孙茜茹 ', 'time': '3:56', 'song': ' 往后余生'}
{'rank': '10', 'singer': '火箭少女101 ', 'time': '3:52', 'song': ' 卡路里'}
{'rank': '11', 'singer': '周笔畅 ', 'time': '3:30', 'song': ' 最美的期待'}
{'rank': '12', 'singer': '221小伙伴 ', 'time': '3:36', 'song': ' 遥远的你 (正式版)'}
{'rank': '13', 'singer': '李袁杰 ', 'time': '3:40', 'song': ' 醉千年'}
{'rank': '14', 'singer': 'G.G(张思源 ) ', 'time': '3:16', 'song': ' 给陌生的你听'}
{'rank': '15', 'singer': '于果 ', 'time': '3:37', 'song': ' 侧脸'}
{'rank': '16', 'singer': '于文文 ', 'time': '4:42', 'song': ' 体面'}
{'rank': '17', 'singer': '胡夏、郁可唯 ', 'time': '4:36', 'song': ' 知否知否'}
{'rank': '18', 'singer': 'Ayo97、阿涵 ', 'time': '3:57', 'song': ' 感谢你曾来过'}
................
21.26.2. 爬取酷狗音乐飙升榜
#!/usr/bin/env python
# -*- coding:utf8 -*-
# author: 18793
# Date:2019/7/10 23:45
# filename: 爬取酷狗音乐飙升榜.py
import requests
from bs4 import BeautifulSoup
import time
import re
from lxml import etree
# Scrape the Kugou Music "soaring" chart in three different ways
# (regular expressions, BeautifulSoup and XPath).

# Address of the chart page that all three scrapers download.
url = "https://www.kugou.com/yy/html/rank.html"
def get_html(url):
    """Download a page and return its body as text.

    :param url: address of the page to download.
    :return: the response text, or None when the request fails (connection
        error, timeout, or an HTTP error status).  A failure is reported by
        printing "request error".
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    try:
        # The timeout prevents an unresponsive server from blocking forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP problems are handled here; programming errors and
        # KeyboardInterrupt are no longer swallowed.
        print("request error")
        return None
    return response.text
def Re_serach(url):
    """Scrape the chart with regular expressions and print one record per song.

    Each printed dict holds the 1-based rank, the song title and the play
    link, under the keys "歌曲排名:", "歌名" and "播放链接:".

    :param url: address of the chart page.
    :return: None; the records are printed to stdout.
    """
    html_info = get_html(url)
    if not html_info:
        # get_html returns None when the download failed; nothing to parse.
        return
    # Raw strings keep "\d" a regex escape rather than an invalid string
    # escape.  The title and the index are captured separately so no quote
    # characters leak into the title.
    song_pattern = re.compile(r'<li class=" " title="([^"]*)"[^>]*?data-index="(\d+)"')
    # "\d+" (not "\d") so entries with a two-digit index are matched as well.
    link_pattern = re.compile(r'<a href="([^"]*)" data-active="playDwn" data-index="\d+"')
    # Pattern.findall takes (string, pos, endpos); flags belong to
    # re.compile, so no second argument is passed here.
    songs = song_pattern.findall(html_info)
    links = link_pattern.findall(html_info)
    for (name, index), link in zip(songs, links):
        info = {
            # data-index is used as a 0-based position, so add 1 for the rank.
            "歌曲排名:": int(index) + 1,
            "歌名": name,
            "播放链接:": link
        }
        print(info)
def Beautifulsoup_search(url):
    """Scrape the chart with BeautifulSoup and print one record per song.

    Each printed dict holds the rank, the song title and the song link,
    under the keys "歌曲排行", "歌曲名称" and "歌曲链接".

    :param url: address of the chart page.
    :return: None; the records are printed to stdout.
    """
    html_info = get_html(url)
    if not html_info:
        # get_html returns None when the download failed; nothing to parse.
        return
    soup = BeautifulSoup(html_info, "lxml")
    # Rank, title and link all live on the same <a> element, so one query is
    # enough.
    anchors = soup.select("#rankWrap > div.pc_temp_songlist.pc_rank_songlist_short > ul > li > a")
    for anchor in anchors:
        data = {
            # data-index is 0-based; the displayed rank starts at 1.
            "歌曲排行": int(anchor.get("data-index")) + 1,
            "歌曲名称": anchor.get_text().strip(),
            "歌曲链接": anchor.get("href")
        }
        print(data)
def Xapth_select(url):
    """Scrape the chart with lxml XPath queries and print one record per song.

    Each printed dict holds the song title, its rank and the song link,
    under the keys "歌曲名称", "歌曲排名" and "歌曲链接".

    :param url: address of the chart page.
    :return: None; the records are printed to stdout.
    """
    html = get_html(url)
    if not html:
        # get_html returns None when the download failed; nothing to parse.
        return
    selector = etree.HTML(html)
    for song_list in selector.xpath('//*[@id="rankWrap"]/div[2]'):
        # Title and link are read from the same <a> element so they cannot
        # drift out of step.  The rank is the row position: reading it from
        # the rank <span> text yields no value for some rows, which forced an
        # offset correction and silently dropped the last rows of the list.
        # NOTE(review): presumably the leading rows wrap the number in a
        # child element -- confirm against the live page.
        for rank, anchor in enumerate(song_list.xpath('ul/li/a'), start=1):
            data = {
                "歌曲名称": "".join(anchor.xpath('text()')).strip(),
                "歌曲排名": rank,
                "歌曲链接": anchor.get('href')
            }
            print(data)
if __name__ == '__main__':
    # Re_serach and Beautifulsoup_search are alternative scrapers for the
    # same page; only the XPath variant is run here.
    scrape = Xapth_select
    scrape(url)
输出信息
{'歌曲排名': 1, '歌曲名称': '郑冰冰 - 渡我不渡她', '歌曲链接': 'https://www.kugou.com/song/yc2xsff.html'}
{'歌曲排名': 2, '歌曲名称': '徐子崴 - 我想和你好好的', '歌曲链接': 'https://www.kugou.com/song/kfura8.html'}
{'歌曲排名': 3, '歌曲名称': '屈杨 - 有一种悲伤 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvg058.html'}
{'歌曲排名': 4, '歌曲名称': '由博文 - Simon (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvfjce.html'}
{'歌曲排名': 5, '歌曲名称': '肖蔷 - 无问西东 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvdx58.html'}
{'歌曲排名': 6, '歌曲名称': '陈其楠 - 未来 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvdz1e.html'}
{'歌曲排名': 7, '歌曲名称': 'CPU - 玫瑰玫瑰我爱你 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvcaa5.html'}
{'歌曲排名': 8, '歌曲名称': '潘玮柏、SeanT肖恩恩、黄旭 - 爱你3000 (Live)', '歌曲链接': 'https://www.kugou.com/song/ye02b10.html'}
{'歌曲排名': 9, '歌曲名称': '田颖 - 寂寞难耐 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydve55d.html'}
{'歌曲排名': 10, '歌曲名称': '陈小同 - 贫穷或富有 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydveq9f.html'}
{'歌曲排名': 11, '歌曲名称': 'MC Hotdog、张震岳、Creamd、Capper - 改变 (Live)', '歌曲链接': 'https://www.kugou.com/song/ye02i9e.html'}
{'歌曲排名': 12, '歌曲名称': '卓玛殷措 - 爱是怀疑 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydwo481.html'}
{'歌曲排名': 13, '歌曲名称': '孙振宇 - 对他说我愿意 (Live)', '歌曲链接': 'https://www.kugou.com/song/ydvh16d.html'}
{'歌曲排名': 14, '歌曲名称': '杨和苏KeyNG、SeanT肖恩恩 - Put it up (Live)', '歌曲链接': 'https://www.kugou.com/song/y51r65f.html'}
{'歌曲排名': 15, '歌曲名称': '豆心 - 一个', '歌曲链接': 'https://www.kugou.com/song/lru4q16.html'}
{'歌曲排名': 16, '歌曲名称': '黄旭 - 孤独 (Live)', '歌曲链接': 'https://www.kugou.com/song/ye07g2f.html'}
{'歌曲排名': 17, '歌曲名称': '新秀 - 不负 (Live)', '歌曲链接': 'https://www.kugou.com/song/ye07d94.html'}
{'歌曲排名': 18, '歌曲名称': '于果 - 鸟儿飞', '歌曲链接': 'https://www.kugou.com/song/xb34d6b.html'}
{'歌曲排名': 19, '歌曲名称': '欢子 - 保重', '歌曲链接': 'https://www.kugou.com/song/ydvipe5.html'}