Added all

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 11:45:52 +02:00
parent 97ff3cdb4a
commit a5ee10bd8b

View File

@@ -772,7 +772,6 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
pass pass
return False return False
def _scrape_via_api(handle: str, cookies: list) -> list: def _scrape_via_api(handle: str, cookies: list) -> list:
""" """
Fallback: hit TikTok's internal item_list API directly with httpx. Fallback: hit TikTok's internal item_list API directly with httpx.
@@ -780,7 +779,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
""" """
logging.info(f"🌐 Trying TikTok API fallback for @{handle}...") logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
# Build a cookie header string from the injected cookies
cookie_header = "; ".join( cookie_header = "; ".join(
f"{c.get('name', '')}={c.get('value', '')}" f"{c.get('name', '')}={c.get('value', '')}"
for c in cookies for c in cookies
@@ -793,35 +791,34 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
"AppleWebKit/537.36 (KHTML, like Gecko) " "AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36" "Chrome/124.0.0.0 Safari/537.36"
), ),
"Referer": f"https://www.tiktok.com/@{handle}", "Referer": f"https://www.tiktok.com/@{handle}",
"Cookie": cookie_header, "Cookie": cookie_header,
"Accept": "application/json, text/plain, */*", "Accept": "application/json, text/plain, */*",
"Accept-Language": "es-ES,es;q=0.9", "Accept-Language": "es-ES,es;q=0.9",
} }
# Resolve the numeric user ID from the profile page HTML user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
user_id = _resolve_tiktok_user_id(handle, headers)
if not user_id: if not user_id and not sec_uid:
logging.warning("⚠️ Could not resolve TikTok user ID for API fallback.") logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
return [] return []
videos = [] videos = []
cursor = 0
try: try:
params = { params = {
"aid": "1988", "aid": "1988",
"app_name": "tiktok_web", "app_name": "tiktok_web",
"count": str(SCRAPE_VIDEO_LIMIT), "count": str(SCRAPE_VIDEO_LIMIT),
"cursor": str(cursor), "cursor": "0",
"secUid": "", "secUid": sec_uid or "",
"id": user_id, "id": user_id or "",
"type": "1", "type": "1",
"sourceType": "8", "sourceType": "8",
"appId": "1233", "appId": "1233",
"region": "ES", "region": "ES",
"priority_region": "ES", "priority_region": "ES",
"language": "es", "language": "es",
} }
resp = httpx.get( resp = httpx.get(
"https://www.tiktok.com/api/post/item_list/", "https://www.tiktok.com/api/post/item_list/",
@@ -833,11 +830,17 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
resp.raise_for_status() resp.raise_for_status()
data = resp.json() data = resp.json()
logging.info(
f"🌐 API response keys: {list(data.keys())}, "
f"statusCode: {data.get('statusCode')}, "
f"items: {len(data.get('itemList', []))}"
)
for item in data.get("itemList", []): for item in data.get("itemList", []):
try: try:
vid_id = item.get("id", "") vid_id = item.get("id", "")
desc = item.get("desc", "") desc = item.get("desc", "")
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
videos.append({ videos.append({
"id": vid_id, "id": vid_id,
"url": url, "url": url,
@@ -856,8 +859,14 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
return videos return videos
def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None: def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
"""Extract the numeric TikTok user ID from the profile page HTML.""" """
Extract both the numeric user ID and secUid from the profile page HTML.
Returns (user_id, sec_uid) — either may be None.
"""
user_id = None
sec_uid = None
try: try:
resp = httpx.get( resp = httpx.get(
f"https://www.tiktok.com/@{handle}", f"https://www.tiktok.com/@{handle}",
@@ -865,21 +874,61 @@ def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
timeout=15, timeout=15,
follow_redirects=True, follow_redirects=True,
) )
# TikTok embeds user data in a __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag html = resp.text
match = re.search(r'"uniqueId"\s*:\s*"[^"]+"\s*,\s*"id"\s*:\s*"(\d+)"', resp.text)
if match: # ── Numeric user ID ────────────────────────────────────────────
uid = match.group(1) id_patterns = [
logging.info(f"✅ Resolved TikTok user ID: {uid}") r'"authorId"\s*:\s*"(\d{15,25})"',
return uid r'"author"\s*:\s*\{[^}]*"id"\s*:\s*"(\d{15,25})"',
# Fallback pattern r'"userId"\s*:\s*"(\d{15,25})"',
match = re.search(r'"authorId"\s*:\s*"(\d+)"', resp.text) r'"uid"\s*:\s*"(\d{15,25})"',
if match: r'"ownerUid"\s*:\s*"(\d{15,25})"',
return match.group(1) r',"id":"(\d{15,25})","uniqueId":"' + re.escape(handle) + r'"',
r'"uniqueId":"' + re.escape(handle) + r'","id":"(\d{15,25})"',
]
for pattern in id_patterns:
m = re.search(pattern, html, re.IGNORECASE)
if m:
user_id = m.group(1)
logging.info(f"✅ Resolved TikTok user ID: {user_id}")
break
# ── secUid ─────────────────────────────────────────────────────
sec_patterns = [
r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
r'"authorSecId"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
]
for pattern in sec_patterns:
m = re.search(pattern, html, re.IGNORECASE)
if m:
sec_uid = m.group(1)
logging.info(f"✅ Resolved TikTok secUid: {sec_uid[:30]}...")
break
if not user_id and not sec_uid:
# Window search fallback
handle_pos = html.find(f'"uniqueId":"{handle}"')
if handle_pos != -1:
window = html[max(0, handle_pos - 300): handle_pos + 300]
m = re.search(r'"id"\s*:\s*"(\d{15,25})"', window)
if m:
user_id = m.group(1)
logging.info(f"✅ Resolved TikTok user ID (window): {user_id}")
m = re.search(r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', window)
if m:
sec_uid = m.group(1)
logging.info(f"✅ Resolved TikTok secUid (window): {sec_uid[:30]}...")
if not user_id and not sec_uid:
logging.warning(
f"⚠️ Could not resolve any TikTok ID for @{handle}. "
f"HTML length: {len(html)} chars."
)
except Exception as e: except Exception as e:
logging.warning(f"⚠️ Could not resolve TikTok user ID: {e}") logging.warning(f"⚠️ Could not resolve TikTok IDs: {e}")
return None
return user_id, sec_uid
def scrape_tiktoks_via_playwright(handle: str) -> list: def scrape_tiktoks_via_playwright(handle: str) -> list:
""" """
Scrape recent videos from a public TikTok profile. Scrape recent videos from a public TikTok profile.