第一步:

创建项目

scrapy startproject [name]

如 scrapy startproject maoyan

第二步:

进入到项目的文件夹目录,创建爬虫(spider)

scrapy genspider movie maoyan.com

第三步:

配置movie.py文件

import scrapy
from maoyan.items import MaoyanItem


class MovieSpider(scrapy.Spider):
    """Spider that walks the Maoyan board pages and yields one item per ``<dd>``.

    Crawling starts at ``https://maoyan.com/board/4`` and continues through
    the link whose text is "下一页" (next page) for as long as one is present.
    """

    name = 'movie'
    allowed_domains = ['maoyan.com']
    start_urls = ['https://maoyan.com/board/4']

    def parse(self, response):
        """Extract movie items from a board page and follow pagination.

        Args:
            response: The downloaded board page.

        Yields:
            A ``MaoyanItem`` with ``title``, ``actor`` and ``time`` for each
            ``<dd>`` element, followed by a ``scrapy.Request`` for the next
            page when a next-page link exists.
        """
        for movie_item in response.xpath('//dd'):
            item = MaoyanItem()
            item['title'] = movie_item.xpath('.//p/a/@title').extract_first()

            # extract_first() returns None when nothing matches; calling
            # .strip() on it directly would raise and abort the whole page.
            actor = movie_item.xpath('.//p[@class="star"]/text()').extract_first()
            item['actor'] = actor.strip() if actor is not None else None

            item['time'] = movie_item.xpath('.//p[@class="releasetime"]/text()').extract_first()
            yield item

        next_url = response.xpath('//a[text()="下一页"]/@href').extract_first()
        # Without a next-page link, urljoin(None) would resolve to the current
        # URL and schedule a redundant request for the same page.
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)

第四步:

配置items.py文件

import scrapy


class MaoyanItem(scrapy.Item):
    """Container for the data scraped for a single movie entry."""

    # Movie title, taken by the spider from the link's ``title`` attribute.
    title = scrapy.Field()
    # Text of the ``<p class="star">`` paragraph.
    actor = scrapy.Field()
    # Text of the ``<p class="releasetime">`` paragraph.
    time = scrapy.Field()

第五步:

配置pipelines.py文件

import pymongo


class MongoPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Each item is written to a collection named after the item's class, inside
    the database given by the ``MONGO_DB`` setting.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet.

        Args:
            mongo_uri: MongoDB connection URI.
            mongo_db: Name of the database to write into.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``MONGO_URI`` / ``MONGO_DB`` settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert ``item`` as one document and pass it on unchanged.

        Args:
            item: The scraped item; converted with ``dict(item)`` before insert.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can process it.
        """
        name = item.__class__.__name__
        # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
        # insert_one() is the supported single-document call.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client opened in ``open_spider``."""
        self.client.close()

第六步:

配置settings.py文件

# -*- coding: utf-8 -*-

# Scrapy settings for maoyan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Project identity and the package where spiders live / are generated.
BOT_NAME = "maoyan"
SPIDER_MODULES = ["maoyan.spiders"]
NEWSPIDER_MODULE = "maoyan.spiders"

# User-Agent header sent with requests (a desktop Chrome string).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/74.0.3729.108 Safari/537.36"
)

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False

# Enabled item pipelines, each mapped to its integer order value.
ITEM_PIPELINES = {
    "maoyan.pipelines.MongoPipeline": 400,
}

# MongoDB connection settings read by MongoPipeline.from_crawler.
# NOTE(review): the URI embeds a username and password in plain text.
MONGO_URI = "mongodb://admin:123456@localhost/"
MONGO_DB = "maoyan"

第七步:

运行项目

scrapy crawl movie

02-14 03:05