# 【资料图】 (image placeholder carried over from the original article)
# Spider: scrapes the video cards shown on the bilibili.com front page.
import scrapy
from scrapy import Selector

from ..items import GetItem  # item model defined in items.py


class BiliSpider(scrapy.Spider):
    """Crawl the bilibili home page and yield one GetItem per video card."""

    name = 'bili'
    allowed_domains = ['bilibili.com']
    start_urls = ['https://www.bilibili.com/']  # page(s) to crawl

    def parse(self, response):
        """Extract title / author / date / link from each video card.

        NOTE(review): the absolute XPath below is brittle — any layout
        change on the page breaks it; a class-based CSS selector would
        be more robust. Kept as-is since it matches the current markup.
        """
        sel = Selector(response)
        list_items = sel.xpath('/html/body/div[2]/div[2]/main/div[2]/div/div[1]/div')
        for list_item in list_items:
            spider_item = GetItem()
            # Use .get() so each field holds a single string (or None)
            # rather than the one-element list .extract() returned —
            # lists serialize badly in the CSV export.
            spider_item['title'] = list_item.css('h3::attr(title)').get()    # video title
            spider_item['author'] = list_item.css('span.bili-video-card__info--author::text').get()  # uploader
            spider_item['time'] = list_item.css('span.bili-video-card__info--date::text').get()      # publish date
            # hrefs on the card are protocol-relative (//www.bilibili.com/...);
            # urljoin resolves them against the response URL.
            link = list_item.css('h3 > a::attr(href)').get()
            spider_item['link'] = response.urljoin(link) if link else None   # absolute video URL
            yield spider_item
# items.py
# Define here the models for your scraped items.
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class GetItem(scrapy.Item):
    """Container for one scraped bilibili video card."""

    title = scrapy.Field()   # video title
    author = scrapy.Field()  # uploader name
    time = scrapy.Field()    # publication date
    link = scrapy.Field()    # video URL
# Configuration
# In settings.py: enable cookies (COOKIES_ENABLED = True) and add request
# headers (set a User-Agent in DEFAULT_REQUEST_HEADERS).
# Start from the command line, saving the results to a CSV file:
#     scrapy crawl bili -o bili.csv