京东的反爬做得太厉害了,没办法一劳永逸,如果需要定制直接看我的个人简介或者私我。

话不多说,先上代码,缺少的库自己pip install一下。如果下面方式不行了,直接找我。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
import openpyxl
from urllib.parse import unquote
import time
 
# Product IDs whose review pages will be scraped.
INDEX = ["100002585808", "3972915", "4642692", "100075885857", "100078384502",
         "1803367119", "100092728232", "100097665850", "100092728228", "100086968888"]

# Opaque anti-bot request parameters (h5st signature, eid token, uuid) captured
# from a real logged-in browser session.  NOTE(review): these expire — refresh
# them from the browser's network tab when requests start getting rejected.
_URL_HEAD = 'https://api.m.jd.com/?appid=item-v3&functionId=pc_club_productPageComments&client=pc&clientVersion=1.0.0&t=1715482279709&body=%7B%22productId%22%3A'
_URL_MID = '%2C%22score%22%3A0%2C%22sortType%22%3A5%2C%22page%22%3A'
_URL_TAIL = '%2C%22pageSize%22%3A10%2C%22isShadowSku%22%3A0%2C%22fold%22%3A1%2C%22bbtf%22%3A%22%22%2C%22shield%22%3A%22%22%7D&h5st=20240512105119715%3Bmmm5minyt95mmzz5%3Bfb5df%3Btk03wb78c1c4618nrKeeYdJ2yjayrF2-WfMePLS-kcojwjDSOi10g2EZndGr4-cxfURV7Ll_ArBAbTKiy8IqH7wW-eQs%3B41e1a99554f7dab734b6b0b9f69301a7514839b964f41d749f6c64941bb3124b%3B4.7%3B1715482279715%3BVadcfHFC9dy7MqhXZZXLTG-a4-3cg23rShjGz_Kk96MTmErl2-7IhOCMSl21mSj8zk27ilRb75YqTXZkGzXYNtjBnMHLy_vZO8ggVv_D5EyaGB0DGG-SBLfB39gh9jA23mL2ZxOXwGqWKZV3W-XqFrD_ft-_RpDa03OIWynaWUwcM9R5rO_vztIMefwyk2uPR_tQwoFyY1tT4KuLEjuLYXK3hoUYrlKqWAk3wai4XhbiESMEpwFV8plxlC77dP-zGMWshp3d9tvAWayk4m1JFNf5O1xhX_SaLJNRdTnoh98FS_EX_qTLyaOIo2afn53PMGl_flbCdgJUZflR_P7IGAV9VRme1L-wD3Kfl6j9W22zVo0MdL3r1S8fdwjPeEliIpepx0BI9PVSQtyt5b0klLoRDAxhSHrg4EgwkaTHc9IIdJkrQxfFTvmEcbKJvfrOxx_iP73CtCqbuZq0wsRXtYkISzxjFx9jRPAv_yAgJo-8lSIP5qHtzgzX7XY3DxZUueMlr33fS_Eq7fUmvwKyx6K8fY_p5xIpUsrVxOLCu7nZggE7nDk8PeheJO0dl8zjLad9Prk3hGJ0DQIeqffFGvzEemLTD52YgeDqWQHLXbk3&x-api-eid-token=jdd037ZMLAWEQ5ADNR56X2CYQ4UNYPG4GE3VPD6CACHL7XJNEDVC3OUTPKYWZ2M3E4AEIPEN7BT3YM6PVWSFHQQJAZMA7FYAAAAMPNK2KJDIAAAAADG72D47XIWI77IX&loginType=3&uuid=181111935.17140969595451523957337.1714096959.1715451172.1715480846.8'


def build_comment_url(product_id, page):
    """Build the pc_club_productPageComments API URL for one review page.

    Args:
        product_id: JD product ID (str or int); spliced into the URL-encoded
            ``body`` JSON as ``productId``.
        page: zero-based review page number, spliced in as ``page``.

    Returns:
        The full request URL as a string.
    """
    return _URL_HEAD + str(product_id) + _URL_MID + str(page) + _URL_TAIL


def _fetch_page_text(browser, url):
    """Load *url* in *browser* and return the page's visible text.

    Waits up to 30 s for a <body> element, then strips the HTML with
    BeautifulSoup.  Raises selenium's TimeoutException if the page
    never renders a body.
    """
    browser.get(url)
    time.sleep(1)
    WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
    )
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    return soup.get_text(separator=' ', strip=True)


def main():
    """Scrape review pages for every product in INDEX, one .txt file per page.

    Opens a Chrome window, gives the operator 40 s to log in manually, then
    fetches up to 99 review pages per product.  If JD's anti-bot page appears
    ("验证一下" / "遭到拒绝"), a product page is opened and the operator gets
    30 s to pass the image-selection captcha before the request is retried.
    """
    browser = webdriver.Chrome()
    # Open a product page so the operator can log in.
    browser.get('https://item.jd.com/100034710036.html')
    time.sleep(40)  # time for you to log in manually

    number = 0  # count of pages saved so far, printed for progress
    # BUG FIX: was `range(0, 9)` (dropped the last ID) with `ID = INDEX[0]`
    # (every iteration scraped the first product) — now each product in
    # INDEX is actually scraped.
    for good in range(len(INDEX)):
        product_id = INDEX[good]
        for page in range(0, 99):
            url = build_comment_url(product_id, page)
            print(url)
            text = _fetch_page_text(browser, url)
            print(text)

            # Anti-bot page detected: let the operator solve the captcha
            # (30 s), then retry the same API URL once.
            if "验证一下" in text or "遭到拒绝" in text:
                browser.get("https://item.jd.com/10086509666149.html#comment")
                time.sleep(30)
                text = _fetch_page_text(browser, url)
                print(text)

            filename = product_id + str(page) + ".txt"
            # 'w' mode creates the file if it does not exist.
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(text)

            # BUG FIX: the original f-string had no placeholder and printed
            # a literal "(unknown)" instead of the file name.
            print(f"文本内容已保存到 {filename} 文件中。")

            number += 1
            print(number)


if __name__ == "__main__":
    main()
Logo

DAMO开发者矩阵,由阿里巴巴达摩院和中国互联网协会联合发起,致力于探讨最前沿的技术趋势与应用成果,搭建高质量的交流与分享平台,推动技术创新与产业应用链接,围绕“人工智能与新型计算”构建开放共享的开发者生态。

更多推荐