Scrapy爬虫Demo

xiaoxiao 2025-10-24 9

#coding=utf-8
"""Demo Scrapy spider that crawls pages reachable from qzone.qq.com."""
import re
import time

import scrapy

from qqcrawler.items import QqcrawlerItem


class QzoneSpider(scrapy.Spider):
    """Spider that records each visited page and follows qzone.qq links.

    For every response it yields one ``QqcrawlerItem`` (crawl time, URL and
    page title) and then schedules a new request, handled by the same
    callback, for every absolute link on the page that matches the qzone.qq
    pattern.
    """

    name = "qzone"
    # allowed_domains = ["qzone.qq.com/"]
    start_urls = [
        # "http://www.ncst.edu.cn/"
        "http://qzone.qq.com/"
        # ,"http://www.qq.com/"
    ]

    # Compiled once at class creation instead of once per href. Only absolute
    # http(s) URLs containing "qzone.qq" are followed; relative hrefs are
    # skipped on purpose, exactly as before.
    _QZONE_LINK_RE = re.compile(r'^http.*qzone\.qq.*')

    def parse(self, response):
        """Yield an item describing ``response``, then requests for its links.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            A ``QqcrawlerItem`` with ``c_time`` (``time.time()`` at parse
            time), ``url`` (the response URL) and ``title`` (the list
            extracted from ``/html/head/title``, or ``None`` when the page
            has no title node), followed by one ``scrapy.Request`` per
            matching ``href`` attribute value.

        Any exception raised while processing the page is logged with its
        traceback and ends processing of this response; it is not re-raised,
        so one bad page does not abort the crawl.
        """
        try:
            # Data scraped from the current page.
            qq_item = QqcrawlerItem()
            qq_item['c_time'] = time.time()
            qq_item['url'] = response.url
            title = response.xpath('/html/head/title')
            qq_item['title'] = title.extract() if title else None
            yield qq_item

            # Keep feeding newly discovered URLs back into the crawler.
            for href in response.xpath('//@href').extract():
                if self._QZONE_LINK_RE.match(href):
                    self.logger.debug("Queueing qzone link: %s", href)
                    yield scrapy.Request(href, callback=self.parse)
        except Exception:
            # Best effort per page, but never silent: a bare ``except`` would
            # also swallow GeneratorExit/KeyboardInterrupt and hide the cause.
            self.logger.exception("Failed to parse %s", response.url)

转载请注明原文地址: https://ju.6miu.com/read-1303483.html

最新回复(0)