
Unable to find or print links from Amazon using XPath, but I can with BeautifulSoup

  •  Kiran JC  ·  asked 7 years ago

    This is the Python script I have tried many things with, but since I am not familiar with XPath it does not work properly:

    from lxml import html
    import csv,os,json
    import requests
    from exceptions import ValueError
    from time import sleep

    def AmzonParser(url):
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
        page = requests.get(url,headers=headers)
        while True:
            sleep(3)
            try:
                doc = html.fromstring(page.content)
                XPATH_NAME = '//h1[@id="title"]//text()'
                XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
                XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
                XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
                XPATH_AVAILABILITY = '//div[@id="availability"]/span/text()'
                XPATH_DESCRIPTION = '//*[@id="productDescription"]/p/text()'
                XPATH_IMAGE = '//*[@id="main-image-container"]/ul/li[5]/span/span/div/img/src'

                RAW_NAME = doc.xpath(XPATH_NAME)
                RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
                RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
                RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
                RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
                RAW_DESCRIPTION = doc.xpath(XPATH_DESCRIPTION)
                RAW_IMAGE = doc.xpath(XPATH_IMAGE)

                NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
                SALE_PRICE = ' '.join(''.join(RAW_SALE_PRICE).split()).strip() if RAW_SALE_PRICE else None
                CATEGORY = ' > '.join([i.strip() for i in RAW_CATEGORY]) if RAW_CATEGORY else None
                ORIGINAL_PRICE = ''.join(RAW_ORIGINAL_PRICE).strip() if RAW_ORIGINAL_PRICE else None
                AVAILABILITY = ''.join(RAW_AVAILABILITY).strip() if RAW_AVAILABILITY else None
                DESCRIPTION = ''.join(RAW_DESCRIPTION).strip() if RAW_DESCRIPTION else None
                IMAGE = ''.join(RAW_IMAGE) if RAW_IMAGE else None

                if not ORIGINAL_PRICE:
                    ORIGINAL_PRICE = SALE_PRICE

                if page.status_code!=200:
                    raise ValueError('captcha')
                data = {
                        'NAME':NAME,
                        'SALE_PRICE':SALE_PRICE,
                        'CATEGORY':CATEGORY,
                        'ORIGINAL_PRICE':ORIGINAL_PRICE,
                        'AVAILABILITY':AVAILABILITY,
                        'URL':url,
                        'DESCRIPTION':DESCRIPTION,
                        'IMAGE':IMAGE,
                        }

                return data
            except Exception as e:
                print e

    def ReadAsin():
        # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__),"Asinfeed.csv")))
        AsinList = ['B008HDREZ6',]
        extracted_data = []
        for i in AsinList:
            url = "http://www.amazon.com/dp/"+i
            print "Processing: "+url
            extracted_data.append(AmzonParser(url))
            sleep(5)
            f=open('data.json','w')
            json.dump(extracted_data,f,indent=4)


    if __name__ == "__main__":
        ReadAsin()
    

    I am not able to get the link to the image.

    Here is the HTML:

    <div class="imgTagWrapper" style="height: 296px;">
      <img src="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg" class="a-dynamic-image a-stretch-vertical" id="" style="max-height: 296px; max-width: 204.282px;" data-old-hires="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg" data-a-manual-replacement="true">
    </div>
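
    As a side note, in XPath an attribute is selected with @, so the path in the script above would have to end in /img/@src rather than /img/src. Below is a minimal, purely illustrative sketch of reading the attributes of that <img>; as the answer below explains, this tag is normally filled in by JavaScript, so on the raw HTML returned by requests these queries may come back empty.

    from lxml import html
    import requests

    url = "http://www.amazon.com/dp/B008HDREZ6"
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}

    doc = html.fromstring(requests.get(url, headers=headers).content)
    # @src selects the attribute value; /img/src would look for a child element named "src"
    src = doc.xpath('//div[@class="imgTagWrapper"]/img/@src')
    hires = doc.xpath('//div[@class="imgTagWrapper"]/img/@data-old-hires')
    print(src or None)
    print(hires or None)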
    
    1 Answer  |  7 years ago

  •   furas  ·  answered 7 years ago

    The page uses JavaScript to put the big images into this tag, but lxml and beautifulsoup can't run JavaScript.

    With lxml / beautifulsoup you can only get the small images, using '//div[@id="altImages"]//img/@src' .

    You can find the URLs of the bigger images inside one of the <script> tags.

    The code below finds the <script> tag that contains data["colorImages"] = , takes that data as a JSON string and converts it into a Python dictionary; after that it is easy to get the URLs of the images in all the different sizes.

    import requests
    from lxml import html
    import json
    
    url = "http://www.amazon.com/dp/B008HDREZ6"
    
    headers = {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    
    response = requests.get(url, headers=headers)
    doc = html.fromstring(response.content)
    
    print('--- small ---')
    XPATH_IMAGE = '//div[@id="altImages"]//img/@src'
    RAW_IMAGE = doc.xpath(XPATH_IMAGE)
    print('\n'.join(RAW_IMAGE[:-1]))
    
    print('--- scripts ---')
    XPATH_SCRIPTS = '//script'
    RAW_SCRIPTS = doc.xpath(XPATH_SCRIPTS)
    data = ''
    for script in RAW_SCRIPTS:
        text = script.text
        # script.text is None for empty <script> tags or tags that only load external files
        if text and 'data["colorImages"]' in text:
            for line in text.splitlines():
                if 'data["colorImages"]' in line:
                    #print(line)
                    data = line


    print('--- data ---')
    # slice away the 'data["colorImages"] = ' assignment (and any leading spaces)
    # plus the trailing ';' so that only the JSON object remains
    data = data[24:-1]
    data = json.loads(data)
    
    print('keys:', data.keys())
    print('keys:', data['Silver'][0].keys())
    print('keys:', data['White'][0].keys())
    
    for item in data['Silver']:
        print('variant:', item['variant'])
        print('main:', item['main'])
        print('large:', item['large'])
        print('hiRes:', item['hiRes'])
        print('thumb:', item['thumb'])
        print('-----')
    

    Thumbnails:

    --- small ---
    https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML._SS40_.jpg
    https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL._SS40_.jpg
    https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL._SS40_.jpg
    

    Data from the JavaScript:

    --- data ---
    keys: dict_keys(['Silver', 'White'])
    keys: dict_keys(['large', 'variant', 'hiRes', 'thumb', 'main'])
    keys: dict_keys(['large', 'variant', 'hiRes', 'thumb', 'main'])
    
    variant: MAIN
    main: {'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX355_.jpg': ['219', '355'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX522_.jpg': ['323', '522'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX450_.jpg': ['278', '450'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX466_.jpg': ['288', '466'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX425_.jpg': ['263', '425']}
    large: https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML.jpg
    hiRes: https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SL1500_.jpg
    thumb: https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML._SS40_.jpg
    -----
    variant: PT01
    main: {'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY550_.jpg': ['550', '380'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY355_.jpg': ['355', '245'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY679_.jpg': ['679', '469'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg': ['450', '311'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY606_.jpg': ['606', '419']}
    large: https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL.jpg
    hiRes: https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg
    thumb: https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL._SS40_.jpg
    -----
    variant: PT02
    main: {'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX466_.jpg': ['311', '466'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX522_.jpg': ['348', '522'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX450_.jpg': ['300', '450'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX425_.jpg': ['283', '425'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX355_.jpg': ['237', '355']}
    large: https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL.jpg
    hiRes: https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SL1500_.jpg
    thumb: https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL._SS40_.jpg
    -----
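
    Since the question mentions that BeautifulSoup already works for you, the same script-scanning trick can be written with BeautifulSoup instead of lxml. This is only a rough sketch under the assumption that the page still embeds data["colorImages"] = ... inside a <script> tag exactly as shown above; the selectors and the assignment format may change over time.

    import json
    import requests
    from bs4 import BeautifulSoup

    url = "http://www.amazon.com/dp/B008HDREZ6"
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }

    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

    # small thumbnail URLs - same idea as the '//div[@id="altImages"]//img/@src' XPath
    thumbs = [img.get('src') for img in soup.select('#altImages img') if img.get('src')]
    print('\n'.join(thumbs))

    # find the <script> that assigns data["colorImages"] and pull the JSON object out of it
    image_data = {}
    for script in soup.find_all('script'):
        text = script.string
        if text and 'data["colorImages"]' in text:
            for line in text.splitlines():
                if 'data["colorImages"]' in line:
                    # drop everything up to the first '=' and the trailing ';'
                    image_data = json.loads(line.split('=', 1)[1].strip().rstrip(';'))

    for color, items in image_data.items():
        for item in items:
            print(color, item.get('variant'), item.get('hiRes'))

    The same guard as above is needed here too: script.string can be None for empty <script> tags or tags that only reference external files.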