23.3.12. 数据库:MongoDB篇

① Mongodb的简单使用示例(无需提前定义字段,比sqlite和mysql方便)

py连接mongodb的一个小测试脚本

conn_mongodb.py

#!/usr/bin/env python
# -*- coding:utf8 -*-
# auther; 18793
# Date:2019/8/13 10:41
# filename: conn_mongoDB.py

import pymongo

# 连接MongoDB,得到一个客户端对象
client = pymongo.MongoClient('mongodb://localhost:27017')  # 方式一
# client = pymongo.MongoClient('localhost':27017)           #方式二

# 建立名为srapydb的数据库
db = client.scrapydb  # 方式一
# db = client['scrapydb']    #方式二

# 建立名为person的数据表
collection = db.person  # 方式一
# collection = db['person']    #方式二

doc = {
    'name': '李小龙',
    'age': 23,
    'sex': 'M',
}

# 插入一条数据到集合
collection.insert_one(doc)

# 关闭客户端
client.close()

与前两章一样,实现存入1000条数据。

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class BooksPipeline(object):
    review_rating_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5
    }

    def process_item(self, item, spider):
        # rating = item.get('review_rating')  #获取review_rating的数据
        rating = item['review_rating']  # 与上面的语句等价
        item['review_rating'] = self.review_rating_map[rating]

        return item


##简单设置方式,在setting.py中开通pipeline即可。
import pymongo


class PymongoPipeline(object):
    def __init__(self):
        client = pymongo.MongoClient('localhost', 27017)
        db = client['scrapydb']
        books = db['books']
        self.post = books  ##连接数据库

    def process_item(self, item, spider):
        info = dict(item)
        self.post.insert(info)  ##插入数据库
        return item

    def close_spider(self, spider):
        self.client.close()

settings.py

ITEM_PIPELINES = {
   'books.pipelines.BooksPipeline': 300,
    'books.pipelines.PymongoPipeline': 402,
}

运行结果如下: image1

当然也可以与前两章一样,设置比较完整的内容: pipelines.py

##完整设置方式
import pymongo

class PymongoPipeline2(object):

    # 打开数据库
    def open_spider(self, spider):
        db_uri = spider.settings.get('MONGODB_URI', 'mongodb://localhost:27017')
        db_name = spider.settings.get('MONGODB_DB_NAME', 'scrapy_default')

        self.db_client = pymongo.MongoClient('mongodb://localhost:27017')
        self.db = self.db_client[db_name]

    # 关闭数据库
    def close_spider(self, spider):
        self.db_client.close()

    # 对数据进行处理
    def process_item(self, item, spider):
        self.insert_db(item)
        return item

    # 插入数据
    def insert_db(self, item):
        info = dict(item)
        self.db.books.insert_one(info)

setting.py

MONGODB_URI = 'mongodb://localhost:27017'
MONGODB_DB_NAME = 'scrapydb'

ITEM_PIPELINES = {
   'books.pipelines.BooksPipeline': 300,
    'books.pipelines.PymongoPipeline2': 403,
}