diff --git a/tiktok2bsky.py b/tiktok2bsky.py index aee2f37..b473ae9 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -772,7 +772,6 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool: pass return False - def _scrape_via_api(handle: str, cookies: list) -> list: """ Fallback: hit TikTok's internal item_list API directly with httpx. @@ -780,7 +779,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list: """ logging.info(f"🌐 Trying TikTok API fallback for @{handle}...") - # Build a cookie header string from the injected cookies cookie_header = "; ".join( f"{c.get('name', '')}={c.get('value', '')}" for c in cookies @@ -793,35 +791,34 @@ def _scrape_via_api(handle: str, cookies: list) -> list: "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/124.0.0.0 Safari/537.36" ), - "Referer": f"https://www.tiktok.com/@{handle}", - "Cookie": cookie_header, - "Accept": "application/json, text/plain, */*", + "Referer": f"https://www.tiktok.com/@{handle}", + "Cookie": cookie_header, + "Accept": "application/json, text/plain, */*", "Accept-Language": "es-ES,es;q=0.9", } - # Resolve the numeric user ID from the profile page HTML - user_id = _resolve_tiktok_user_id(handle, headers) - if not user_id: - logging.warning("⚠️ Could not resolve TikTok user ID for API fallback.") + user_id, sec_uid = _resolve_tiktok_ids(handle, headers) + + if not user_id and not sec_uid: + logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.") return [] videos = [] - cursor = 0 try: params = { - "aid": "1988", - "app_name": "tiktok_web", - "count": str(SCRAPE_VIDEO_LIMIT), - "cursor": str(cursor), - "secUid": "", - "id": user_id, - "type": "1", - "sourceType": "8", - "appId": "1233", - "region": "ES", + "aid": "1988", + "app_name": "tiktok_web", + "count": str(SCRAPE_VIDEO_LIMIT), + "cursor": "0", + "secUid": sec_uid or "", + "id": user_id or "", + "type": "1", + "sourceType": "8", + "appId": "1233", + "region": "ES", "priority_region": "ES", - "language": "es", + "language": "es", } resp = httpx.get( "https://www.tiktok.com/api/post/item_list/", @@ -833,11 +830,17 @@ def _scrape_via_api(handle: str, cookies: list) -> list: resp.raise_for_status() data = resp.json() + logging.info( + f"🌐 API response keys: {list(data.keys())}, " + f"statusCode: {data.get('statusCode')}, " + f"items: {len(data.get('itemList', []))}" + ) + for item in data.get("itemList", []): try: - vid_id = item.get("id", "") - desc = item.get("desc", "") - url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" + vid_id = item.get("id", "") + desc = item.get("desc", "") + url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" videos.append({ "id": vid_id, "url": url, @@ -856,8 +859,14 @@ def _scrape_via_api(handle: str, cookies: list) -> list: return videos -def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None: - """Extract the numeric TikTok user ID from the profile page HTML.""" +def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]: + """ + Extract both the numeric user ID and secUid from the profile page HTML. + Returns (user_id, sec_uid) — either may be None. + """ + user_id = None + sec_uid = None + try: resp = httpx.get( f"https://www.tiktok.com/@{handle}", @@ -865,21 +874,61 @@ def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None: timeout=15, follow_redirects=True, ) - # TikTok embeds user data in a __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag - match = re.search(r'"uniqueId"\s*:\s*"[^"]+"\s*,\s*"id"\s*:\s*"(\d+)"', resp.text) - if match: - uid = match.group(1) - logging.info(f"✅ Resolved TikTok user ID: {uid}") - return uid - # Fallback pattern - match = re.search(r'"authorId"\s*:\s*"(\d+)"', resp.text) - if match: - return match.group(1) + html = resp.text + + # ── Numeric user ID ──────────────────────────────────────────── + id_patterns = [ + r'"authorId"\s*:\s*"(\d{15,25})"', + r'"author"\s*:\s*\{[^}]*"id"\s*:\s*"(\d{15,25})"', + r'"userId"\s*:\s*"(\d{15,25})"', + r'"uid"\s*:\s*"(\d{15,25})"', + r'"ownerUid"\s*:\s*"(\d{15,25})"', + r',"id":"(\d{15,25})","uniqueId":"' + re.escape(handle) + r'"', + r'"uniqueId":"' + re.escape(handle) + r'","id":"(\d{15,25})"', + ] + for pattern in id_patterns: + m = re.search(pattern, html, re.IGNORECASE) + if m: + user_id = m.group(1) + logging.info(f"✅ Resolved TikTok user ID: {user_id}") + break + + # ── secUid ───────────────────────────────────────────────────── + sec_patterns = [ + r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', + r'"authorSecId"\s*:\s*"([A-Za-z0-9_\-]{20,})"', + ] + for pattern in sec_patterns: + m = re.search(pattern, html, re.IGNORECASE) + if m: + sec_uid = m.group(1) + logging.info(f"✅ Resolved TikTok secUid: {sec_uid[:30]}...") + break + + if not user_id and not sec_uid: + # Window search fallback + handle_pos = html.find(f'"uniqueId":"{handle}"') + if handle_pos != -1: + window = html[max(0, handle_pos - 300): handle_pos + 300] + m = re.search(r'"id"\s*:\s*"(\d{15,25})"', window) + if m: + user_id = m.group(1) + logging.info(f"✅ Resolved TikTok user ID (window): {user_id}") + m = re.search(r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', window) + if m: + sec_uid = m.group(1) + logging.info(f"✅ Resolved TikTok secUid (window): {sec_uid[:30]}...") + + if not user_id and not sec_uid: + logging.warning( + f"⚠️ Could not resolve any TikTok ID for @{handle}. " + f"HTML length: {len(html)} chars." + ) + except Exception as e: - logging.warning(f"⚠️ Could not resolve TikTok user ID: {e}") - return None - + logging.warning(f"⚠️ Could not resolve TikTok IDs: {e}") + return user_id, sec_uid def scrape_tiktoks_via_playwright(handle: str) -> list: """ Scrape recent videos from a public TikTok profile.