Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@
'Upgrade-Insecure-Requests': '1',
}

# --- Random User Agents for Anti-Detection ---
# Pool of realistic desktop-browser User-Agent strings (recent Chrome,
# Firefox, Safari and Edge builds on Windows/macOS). One entry is drawn
# at random by get_random_user_agent() so that successive browser
# sessions do not all present an identical fingerprint.
# NOTE(review): browser versions here go stale over time — refresh
# these strings periodically to keep them plausible.
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
]

# --- Client Initialization ---
# 检查配置是否齐全
if not all([BASE_URL, MODEL_NAME]):
Expand Down Expand Up @@ -92,3 +102,11 @@ def get_ai_request_params(**kwargs):
if ENABLE_THINKING:
kwargs["extra_body"] = {"enable_thinking": False}
return kwargs


def get_random_user_agent():
    """Return a randomly chosen User-Agent string for anti-detection.

    Returns:
        str: one entry selected uniformly at random from ``USER_AGENTS``.
    """
    # Local import keeps module import time free of this dependency,
    # matching the original's function-scope import style.
    from random import choice
    return choice(USER_AGENTS)
12 changes: 8 additions & 4 deletions src/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
RUN_HEADLESS,
RUNNING_IN_DOCKER,
STATE_FILE,
get_random_user_agent,
)
from src.parsers import (
_parse_search_results_json,
Expand Down Expand Up @@ -183,7 +184,10 @@ async def scrape_xianyu(task_config: dict, debug_limit: int = 0):
browser = await p.chromium.launch(headless=RUN_HEADLESS)
else:
browser = await p.chromium.launch(headless=RUN_HEADLESS, channel="chrome")
context = await browser.new_context(storage_state=STATE_FILE, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
# 使用随机User-Agent增强反检测能力
random_ua = get_random_user_agent()
print(f" [反检测] 使用随机User-Agent: {random_ua}")
context = await browser.new_context(storage_state=STATE_FILE, user_agent=random_ua)
page = await context.new_page()

try:
Expand Down Expand Up @@ -343,7 +347,7 @@ async def scrape_xianyu(task_config: dict, debug_limit: int = 0):
if "FAIL_SYS_USER_VALIDATE" in ret_string:
print("\n==================== CRITICAL BLOCK DETECTED ====================")
print("检测到闲鱼反爬虫验证 (FAIL_SYS_USER_VALIDATE),程序将终止。")
long_sleep_duration = random.randint(3, 60)
long_sleep_duration = random.randint(60, 180) # 增加到1-3分钟
print(f"为避免账户风险,将执行一次长时间休眠 ({long_sleep_duration} 秒) 后再退出...")
await asyncio.sleep(long_sleep_duration)
print("长时间休眠结束,现在将安全退出。")
Expand Down Expand Up @@ -468,7 +472,7 @@ async def scrape_xianyu(task_config: dict, debug_limit: int = 0):

# --- 修改: 增加单个商品处理后的主要延迟 ---
log_time("[反爬] 执行一次主要的随机延迟以模拟用户浏览间隔...")
await random_sleep(15, 30) # 原来是 (8, 15),这是最重要的修改之一
await random_sleep(25, 45) # 增加页面间延迟
else:
print(f" 错误: 获取商品详情API响应失败,状态码: {detail_response.status}")
if AI_DEBUG_MODE:
Expand All @@ -491,7 +495,7 @@ async def scrape_xianyu(task_config: dict, debug_limit: int = 0):
# --- 新增: 在处理完一页所有商品后,翻页前,增加一个更长的“休息”时间 ---
if not stop_scraping and page_num < max_pages:
print(f"--- 第 {page_num} 页处理完毕,准备翻页。执行一次页面间的长时休息... ---")
await random_sleep(25, 50)
await random_sleep(35, 60) # 增加页面处理完毕后的长延迟

except PlaywrightTimeoutError as e:
print(f"\n操作超时错误: 页面元素或网络响应未在规定时间内出现。\n{e}")
Expand Down