Contents
23.3.12. 数据库:MongoDB篇¶
① Mongodb的简单使用示例(无需提前定义字段,比sqlite和mysql方便)
py连接mongodb的一个小测试脚本
conn_mongodb.py
#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/13 10:41
# filename: conn_mongoDB.py
import pymongo
# 连接MongoDB,得到一个客户端对象
client = pymongo.MongoClient('mongodb://localhost:27017') # 方式一
# client = pymongo.MongoClient('localhost':27017) #方式二
# 建立名为srapydb的数据库
db = client.scrapydb # 方式一
# db = client['scrapydb'] #方式二
# 建立名为person的数据表
collection = db.person # 方式一
# collection = db['person'] #方式二
doc = {
'name': '李小龙',
'age': 23,
'sex': 'M',
}
# 插入一条数据到集合
collection.insert_one(doc)
# 关闭客户端
client.close()
与前两章一样,实现存入1000条数据。
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class BooksPipeline(object):
review_rating_map = {
'One': 1,
'Two': 2,
'Three': 3,
'Four': 4,
'Five': 5
}
def process_item(self, item, spider):
# rating = item.get('review_rating') #获取review_rating的数据
rating = item['review_rating'] # 与上面的语句等价
item['review_rating'] = self.review_rating_map[rating]
return item
##简单设置方式,在setting.py中开通pipeline即可。
import pymongo
class PymongoPipeline(object):
def __init__(self):
client = pymongo.MongoClient('localhost', 27017)
db = client['scrapydb']
books = db['books']
self.post = books ##连接数据库
def process_item(self, item, spider):
info = dict(item)
self.post.insert(info) ##插入数据库
return item
def close_spider(self, spider):
self.client.close()
settings.py
ITEM_PIPELINES = {
'books.pipelines.BooksPipeline': 300,
'books.pipelines.PymongoPipeline': 402,
}
运行结果如下: 
当然也可以与前两章一样,设置比较完整的内容: pipelines.py
##完整设置方式
import pymongo
class PymongoPipeline2(object):
# 打开数据库
def open_spider(self, spider):
db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
db_name = spider.settings.get('MONGODB_DB_NAME', 'scrapy_default')
self.db_client = pymongo.MongoClient('mongodb://localhost:27017')
self.db = self.db_client[db_name]
# 关闭数据库
def close_spider(self, spider):
self.db_client.close()
# 对数据进行处理
def process_item(self, item, spider):
self.insert_db(item)
return item
# 插入数据
def insert_db(self, item):
info = dict(item)
self.db.books.insert_one(info)
setting.py
MONGODB_URI = 'mongodb://localhost:27017'
MONGODB_DB_NAME = 'scrapydb'
ITEM_PIPELINES = {
'books.pipelines.BooksPipeline': 300,
'books.pipelines.PymongoPipeline2': 403,
}