Resolve both TikTok user ID and secUid for the API fallback; log API response summary

2026-05-19 11:45:52 +02:00
parent 97ff3cdb4a
commit a5ee10bd8b
1 changed files with 88 additions and 39 deletions
--- a/tiktok2bsky.py
+++ b/tiktok2bsky.py
@@ -772,7 +772,6 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
            pass
    return False

-
 def _scrape_via_api(handle: str, cookies: list) -> list:
    """
    Fallback: hit TikTok's internal item_list API directly with httpx.
@@ -780,7 +779,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
    """
    logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")

-    # Build a cookie header string from the injected cookies
    cookie_header = "; ".join(
        f"{c.get('name', '')}={c.get('value', '')}"
        for c in cookies
@@ -793,35 +791,34 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
-        "Referer":        f"https://www.tiktok.com/@{handle}",
-        "Cookie":         cookie_header,
-        "Accept":         "application/json, text/plain, */*",
+        "Referer":         f"https://www.tiktok.com/@{handle}",
+        "Cookie":          cookie_header,
+        "Accept":          "application/json, text/plain, */*",
        "Accept-Language": "es-ES,es;q=0.9",
    }

-    # Resolve the numeric user ID from the profile page HTML
-    user_id = _resolve_tiktok_user_id(handle, headers)
-    if not user_id:
-        logging.warning("⚠️ Could not resolve TikTok user ID for API fallback.")
+    user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
+
+    if not user_id and not sec_uid:
+        logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
        return []

    videos = []
-    cursor = 0

    try:
        params = {
-            "aid":          "1988",
-            "app_name":     "tiktok_web",
-            "count":        str(SCRAPE_VIDEO_LIMIT),
-            "cursor":       str(cursor),
-            "secUid":       "",
-            "id":           user_id,
-            "type":         "1",
-            "sourceType":   "8",
-            "appId":        "1233",
-            "region":       "ES",
+            "aid":             "1988",
+            "app_name":        "tiktok_web",
+            "count":           str(SCRAPE_VIDEO_LIMIT),
+            "cursor":          "0",
+            "secUid":          sec_uid or "",
+            "id":              user_id or "",
+            "type":            "1",
+            "sourceType":      "8",
+            "appId":           "1233",
+            "region":          "ES",
            "priority_region": "ES",
-            "language":     "es",
+            "language":        "es",
        }
        resp = httpx.get(
            "https://www.tiktok.com/api/post/item_list/",
@@ -833,11 +830,17 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
        resp.raise_for_status()
        data = resp.json()

+        logging.info(
+            f"🌐 API response keys: {list(data.keys())}, "
+            f"statusCode: {data.get('statusCode')}, "
+            f"items: {len(data.get('itemList', []))}"
+        )
+
        for item in data.get("itemList", []):
            try:
-                vid_id  = item.get("id", "")
-                desc    = item.get("desc", "")
-                url     = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
+                vid_id = item.get("id", "")
+                desc   = item.get("desc", "")
+                url    = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
                videos.append({
                    "id":        vid_id,
                    "url":       url,
@@ -856,8 +859,14 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
    return videos


-def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
-    """Extract the numeric TikTok user ID from the profile page HTML."""
+def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
+    """
+    Extract both the numeric user ID and secUid from the profile page HTML.
+    Returns (user_id, sec_uid) — either may be None.
+    """
+    user_id = None
+    sec_uid = None
+
    try:
        resp = httpx.get(
            f"https://www.tiktok.com/@{handle}",
@@ -865,21 +874,61 @@ def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
            timeout=15,
            follow_redirects=True,
        )
-        # TikTok embeds user data in a __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag
-        match = re.search(r'"uniqueId"\s*:\s*"[^"]+"\s*,\s*"id"\s*:\s*"(\d+)"', resp.text)
-        if match:
-            uid = match.group(1)
-            logging.info(f"✅ Resolved TikTok user ID: {uid}")
-            return uid
-        # Fallback pattern
-        match = re.search(r'"authorId"\s*:\s*"(\d+)"', resp.text)
-        if match:
-            return match.group(1)
+        html = resp.text
+
+        # ── Numeric user ID ────────────────────────────────────────────
+        id_patterns = [
+            r'"authorId"\s*:\s*"(\d{15,25})"',
+            r'"author"\s*:\s*\{[^}]*"id"\s*:\s*"(\d{15,25})"',
+            r'"userId"\s*:\s*"(\d{15,25})"',
+            r'"uid"\s*:\s*"(\d{15,25})"',
+            r'"ownerUid"\s*:\s*"(\d{15,25})"',
+            r',"id":"(\d{15,25})","uniqueId":"' + re.escape(handle) + r'"',
+            r'"uniqueId":"' + re.escape(handle) + r'","id":"(\d{15,25})"',
+        ]
+        for pattern in id_patterns:
+            m = re.search(pattern, html, re.IGNORECASE)
+            if m:
+                user_id = m.group(1)
+                logging.info(f"✅ Resolved TikTok user ID: {user_id}")
+                break
+
+        # ── secUid ─────────────────────────────────────────────────────
+        sec_patterns = [
+            r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
+            r'"authorSecId"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
+        ]
+        for pattern in sec_patterns:
+            m = re.search(pattern, html, re.IGNORECASE)
+            if m:
+                sec_uid = m.group(1)
+                logging.info(f"✅ Resolved TikTok secUid: {sec_uid[:30]}...")
+                break
+
+        if not user_id and not sec_uid:
+            # Window search fallback
+            handle_pos = html.find(f'"uniqueId":"{handle}"')
+            if handle_pos != -1:
+                window = html[max(0, handle_pos - 300): handle_pos + 300]
+                m = re.search(r'"id"\s*:\s*"(\d{15,25})"', window)
+                if m:
+                    user_id = m.group(1)
+                    logging.info(f"✅ Resolved TikTok user ID (window): {user_id}")
+                m = re.search(r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', window)
+                if m:
+                    sec_uid = m.group(1)
+                    logging.info(f"✅ Resolved TikTok secUid (window): {sec_uid[:30]}...")
+
+        if not user_id and not sec_uid:
+            logging.warning(
+                f"⚠️ Could not resolve any TikTok ID for @{handle}. "
+                f"HTML length: {len(html)} chars."
+            )
+
    except Exception as e:
-        logging.warning(f"⚠️ Could not resolve TikTok user ID: {e}")
-    return None
-
+        logging.warning(f"⚠️ Could not resolve TikTok IDs: {e}")

+    return user_id, sec_uid
 def scrape_tiktoks_via_playwright(handle: str) -> list:
    """
    Scrape recent videos from a public TikTok profile.