import re
import time

import scrapy

from qqcrawler.items import QqcrawlerItem
class QzoneSpider(scrapy.Spider):
    """Spider that starts at ``http://qzone.qq.com/`` and follows Qzone links.

    For every response it yields one ``QqcrawlerItem`` carrying the crawl
    time, the response URL and the page title, and then schedules a request
    for each ``href`` value on the page that matches ``_QZONE_URL_RE``.
    """

    name = "qzone"
    start_urls = [
        "http://qzone.qq.com/",
    ]

    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # instead of being an invalid string escape; compiled once for all pages.
    _QZONE_URL_RE = re.compile(r'^http.*qzone\.qq.*')

    def parse(self, response):
        """Yield an item for ``response`` and requests for its Qzone links.

        Args:
            response: The response object passed to this callback.

        Yields:
            A ``QqcrawlerItem`` with ``c_time`` (``time.time()`` at parse
            time), ``url`` (``response.url``) and ``title`` (the list
            extracted from ``/html/head/title``, or ``None`` when nothing
            matches), followed by one ``scrapy.Request`` per matching
            ``href`` value, each using this method as its callback.
        """
        try:
            qq_item = QqcrawlerItem()
            qq_item['c_time'] = time.time()
            qq_item['url'] = response.url

            # Run the XPath once; an empty result is stored as None.
            titles = response.xpath('/html/head/title').extract()
            qq_item['title'] = titles or None
            yield qq_item

            # Iterating an empty result is a no-op, so no separate
            # existence check is needed before the loop.
            for href in response.xpath('//@href').extract():
                if self._QZONE_URL_RE.match(href):
                    self.logger.info('%s ================', href)
                    yield scrapy.Request(href, callback=self.parse)
        except Exception:
            # Keep the best-effort behaviour (one bad page must not abort
            # the callback with a traceback to the engine) but record the
            # failure instead of discarding it. ``Exception`` rather than a
            # bare ``except`` so GeneratorExit/KeyboardInterrupt propagate.
            self.logger.exception('Error while parsing %r', response)
# When reposting, please credit the original source: https://ju.6miu.com/read-1303483.html