由于Scrapy的异步特性,您甚至不能确定响应会按照请求发出的顺序到达回调。您可以做的是获取URL列表,通过请求的 `meta` 将其传递下去,然后按如下方式逐个访问这些URL:
def parse(self, response):
    """Extract the page's links and request one of them, carrying the rest along.

    The URLs of all links that ``LinkExtractor(allow_domains='example.com')``
    finds in ``response`` are collected into a list.  The last URL is removed
    from that list and requested with ``parse_links`` as the callback; the
    remaining URLs travel with the request under the ``'urls'`` meta key.

    Returns the ``Request``, or ``None`` when no links were extracted.
    """
    extractor = LinkExtractor(
        allow_domains='example.com'
    )
    pending = [link.url for link in extractor.extract_links(response)]

    # Nothing was extracted, so there is nothing to follow.
    if not pending:
        return None

    # list.pop() removes the last element; the shortened list rides in meta.
    next_url = pending.pop()
    return Request(next_url, callback=self.parse_links, meta={'urls': pending})
def parse_links(self, response):
    """Return the e-mail addresses found on the page, or request the next URL.

    The e-mail pattern is applied to the ``<a href=...>`` elements selected
    from ``response``.  If anything matches, an ``EmailScraperItem`` with the
    matches under ``'emails'`` is returned and no further request is issued
    from this callback.

    Otherwise the list stored under ``response.meta['urls']`` is consulted:
    its last URL is removed and requested with this same method as callback,
    with the shortened list passed along again in ``meta``.

    Returns:
        The populated item, the follow-up ``Request``, or ``None`` when
        nothing matched and the URL list is empty.

    Raises:
        KeyError: if ``response.meta`` has no ``'urls'`` entry.
    """
    # Raw string: the pattern value is unchanged, but ``\w`` and ``\.`` are no
    # longer invalid string escapes (which newer Python versions warn about).
    mailrex = r'[\w\.-]+@[\w\.-]+'
    result = response.xpath('//a[@href]').re(mailrex)
    if result:
        # Build the item only when there is something to put into it.
        item = EmailScraperItem()
        item['emails'] = result
        return item

    urls = response.meta['urls']
    # Explicit emptiness check instead of catching IndexError from pop(),
    # so unrelated errors raised while building the Request are not hidden.
    if not urls:
        return None
    return Request(urls.pop(), callback=self.parse_links, meta={'urls': urls})