I am learning how to use Scrapy and am trying to build a crawler that scrapes a website's links and text. My crawler works for http://quotes.toscrape.com/ and http://books.toscrape.com/, but not for real-life websites such as https://pypi.org/project/wikipedia/ or Wikipedia. I don't understand what is causing this. Please help me.
My code:
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from scrapy.utils.log import configure_logging


class firstSpider(scrapy.Spider):
    name = "htmlcrawler"

    start_urls = [
        'https://pypi.org/project/wikipedia/',
    ]

    def parse(self, response):
        # Try three selectors and keep whichever matched, treating misses as ''
        val1 = response.css("p.text::text").extract_first()
        val2 = response.css("span.text::text").extract_first()
        val3 = response.css("pre.text::text").extract_first()
        text = ("" if val3 is None else val3) + ("" if val2 is None else val2) + ("" if val1 is None else val1)
        NEXT_PAGE_SELECTOR = '.next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        print(next_page)
        if next_page:
            next_page = response.urljoin(next_page)
            yield {'html': next_page, 'text': text}
            yield scrapy.Request(next_page, callback=self.parse)


def run():
    settings = get_project_settings()
    settings.set('FEED_FORMAT', 'json')
    settings.set('FEED_URI', 'result.json')
    settings.set('DEPTH_LIMIT', 60)
    settings.set('DOWNLOAD_DELAY', 2)
    settings.set('DUPEFILTER_CLASS', 'scrapy.dupefilters.BaseDupeFilter')
    configure_logging()
    runner = CrawlerRunner(settings)
    runner.crawl(firstSpider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()


if __name__ == "__main__":
    run()
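As far as I can tell, the selectors in parse() assume the markup of the toscrape sites: on quotes.toscrape.com the quote text is in span.text and the pager link in .next a. A quick way I know of to compare the two sites is scrapy shell; the pypi.org results below are my assumption, since I have not confirmed its markup:

$ scrapy shell 'http://quotes.toscrape.com/'
>>> response.css('span.text::text').extract_first()       # returns the first quote on the page
>>> response.css('.next a ::attr(href)').extract_first()  # returns '/page/2/'

$ scrapy shell 'https://pypi.org/project/wikipedia/'
>>> response.css('span.text::text').extract_first()       # presumably None: no span.text markup there
>>> response.css('.next a ::attr(href)').extract_first()  # presumably None: no .next pager there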
I am struggling with running this from Hydrogen in Atom.
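If the way I launch it matters: my understanding is that reactor.run() can only be started once per process, so re-running the script inside an interactive environment raises twisted.internet.error.ReactorNotRestartable. A minimal sketch of the alternative entry point from the Scrapy docs, CrawlerProcess, which starts and stops the reactor itself (I kept my settings, minus the dupefilter change):

from scrapy.crawler import CrawlerProcess

def run():
    # CrawlerProcess manages the Twisted reactor itself, so there are
    # no direct reactor.run()/reactor.stop() calls to restart.
    process = CrawlerProcess(settings={
        'FEED_FORMAT': 'json',
        'FEED_URI': 'result.json',
        'DEPTH_LIMIT': 60,
        'DOWNLOAD_DELAY': 2,
    })
    process.crawl(firstSpider)
    process.start()  # blocks here until the crawl finishes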
EDIT:
I changed the dupefilter class and also tried the spider code from https://blog.siliconstraits.vn/building-web-crawler-scrapy/, but it still does not work.
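For context on that change: as I understand it, Scrapy's default DUPEFILTER_CLASS is scrapy.dupefilters.RFPDupeFilter, while scrapy.dupefilters.BaseDupeFilter does no filtering at all:

# Scrapy's default: drop requests whose URL fingerprint was already seen
settings.set('DUPEFILTER_CLASS', 'scrapy.dupefilters.RFPDupeFilter')

# What my script sets instead: no deduplication, every yielded request is followed
settings.set('DUPEFILTER_CLASS', 'scrapy.dupefilters.BaseDupeFilter')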