Contents
23.3.7. 模拟登陆webscraping.com网站¶
爬取网址:http://example.webscraping.com
代码示例:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest
class LoginSpider(scrapy.Spider):
name = 'login'
allowed_domains = ['example.webscraping.com']
start_urls = ['http://example.webscraping.com/places/default/user/profile']
##-------------------------------进行登录-------------------------------
# 登录URL
login_url = "http://example.webscraping.com/places/default/user/login"
# 改写start_requests方法
def start_requests(self):
yield scrapy.Request(self.login_url, callback=self.login)
# 登录页面的信息处理
def login(self, response):
form_data = {'email': '1879324764@qq.com', 'password': 'admin#123'}
yield FormRequest.from_response(response, formdata=form_data, callback=self.parse_login)
# 登录成功后,会自动抓取start_urls中的网址,并用parse方法解析。
def parse_login(self, response):
if "欢迎 jianli" in response.text:
yield from super().start_requests() # 继承基类的start_requests方法,处理完会自动跳转到parse方法。
##-------------------------------登录后-------------------------------
# 登录后的信息解析工作
def parse(self, response):
keys = response.xpath('//td[@class="w2p_fl"]/label/text()').re('(.*?):')
values = response.xpath('//td[@class="w2p_fw"]/text()').extract()
yield dict(zip(keys, values))
# 导入CrawlerProcess类
from scrapy.crawler import CrawlerProcess
# 获取项目的设置信息
from scrapy.utils.project import get_project_settings
if __name__ == '__main__':
# 创建CrawlerProcess类对象,并将获取的设置信息传入
process = CrawlerProcess(get_project_settings())
# 设置需要启动的爬虫名称
process.crawl('login')
# 启动爬虫
process.start()