请以此代码段为例,然后使用自定义函数对其进行扩展:
from time import sleep
from urllib.parse import urljoin

import requests
from lxml import html

# Browser-like request headers: Amazon tends to reject requests that do not
# look like they come from a real browser.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "en-US,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

# NOTE(review): this dict was defined but never passed to requests.get() in
# the original snippet. Pass `proxies=proxies` explicitly if proxying is
# actually wanted — the addresses listed here are likely stale.
proxies = {
    'http': 'http://95.167.116.116:8080',
    'https': 'http://88.157.149.250:8080',
}


def collect_links(start_url, retry_delay=5):
    """Follow Amazon search-result pagination starting from *start_url*.

    Fetches each result page, collects the product-detail hrefs it finds,
    and follows the "next page" link until no such link exists.

    Parameters:
        start_url: URL of the first search-results page.
        retry_delay: seconds to sleep before retrying after a failure.

    Returns:
        The list of all href strings gathered across every page.
    """
    links = []
    url = start_url
    while True:
        print('Fetching url [%s]...' % url)
        try:
            # timeout= prevents the request from hanging indefinitely;
            # the original also passed stream=True but never closed the
            # response, leaking the connection.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            # Only network-level failures land here — a blanket
            # `except Exception` (as in the original) would also hide
            # parsing bugs behind the "connection refused" message.
            print("Connection refused by the server..")
            print("Let me sleep for %d seconds" % retry_delay)
            print("ZZzzzz...")
            sleep(retry_delay)
            print("Was a nice sleep, now let me continue...")
            continue
        if response.status_code != 200:
            # The original busy-looped on non-200 responses, refetching the
            # same URL with no pause; back off before retrying instead.
            print("Got HTTP %d, retrying after a pause..." % response.status_code)
            sleep(retry_delay)
            continue
        source = html.fromstring(response.content)
        links.extend(source.xpath(
            '//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href'))
        next_candidates = source.xpath('//*[@id="pagnNextLink"]/@href')
        if not next_candidates:
            # No "next page" link: we reached the last results page.
            break
        url = urljoin('https://www.amazon.com', next_candidates[0])
    return links


if __name__ == '__main__':
    start = 'https://www.amazon.com/s/ref=sr_pg_1?fst=as%3Aoff&rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011%2Cn%3A11444072011%2Cn%3A11444086011%2Cn%3A2632268011&bbn=11444086011&ie=UTF8&qid=1517831374'
    print(collect_links(start))
实际上,它会从当前页面中提取下一页的链接。如果能找到下一页的 URL,就按上面代码所示继续抓取;如果找不到,则跳出
while
循环,并打印收集到的
links
列表。
希望有帮助。