Added yt-dlp profile scrape as the fallback scraper (replaces the direct TikTok item_list API call)

2026-05-19 11:51:14 +02:00
parent a5ee10bd8b
commit 3c3dd159e1
1 changed file with 55 additions and 63 deletions
--- a/tiktok2bsky.py
+++ b/tiktok2bsky.py
@@ -774,73 +774,61 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:

 def _scrape_via_api(handle: str, cookies: list) -> list:
    """
-    Fallback: hit TikTok's internal item_list API directly with httpx.
+    Fallback scraper using yt-dlp to extract the video list from a
+    TikTok profile. yt-dlp handles TikTok's signing tokens internally.
    Returns same list-of-dicts format as the Playwright scraper.
    """
-    logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
-
-    cookie_header = "; ".join(
-        f"{c.get('name', '')}={c.get('value', '')}"
-        for c in cookies
-        if c.get("name") and c.get("value")
-    )
-
-    headers = {
-        "User-Agent": (
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/124.0.0.0 Safari/537.36"
-        ),
-        "Referer":         f"https://www.tiktok.com/@{handle}",
-        "Cookie":          cookie_header,
-        "Accept":          "application/json, text/plain, */*",
-        "Accept-Language": "es-ES,es;q=0.9",
-    }
-
-    user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
-
-    if not user_id and not sec_uid:
-        logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
-        return []
+    logging.info(f"📦 Trying yt-dlp profile scrape fallback for @{handle}...")

+    cookie_file = None
    videos      = []

    try:
-        params = {
-            "aid":             "1988",
-            "app_name":        "tiktok_web",
-            "count":           str(SCRAPE_VIDEO_LIMIT),
-            "cursor":          "0",
-            "secUid":          sec_uid or "",
-            "id":              user_id or "",
-            "type":            "1",
-            "sourceType":      "8",
-            "appId":           "1233",
-            "region":          "ES",
-            "priority_region": "ES",
-            "language":        "es",
+        import yt_dlp
+
+        cookie_file = _write_netscape_cookies(cookies)
+
+        ydl_opts = {
+            "quiet":          True,
+            "no_warnings":    False,
+            "extract_flat":   True,       # metadata only — no download
+            "playlistend":    SCRAPE_VIDEO_LIMIT,
+            "ignoreerrors":   True,
        }
-        resp = httpx.get(
-            "https://www.tiktok.com/api/post/item_list/",
-            params=params,
-            headers=headers,
-            timeout=20,
-            follow_redirects=True,
-        )
-        resp.raise_for_status()
-        data = resp.json()
+        if cookie_file:
+            ydl_opts["cookiefile"] = cookie_file

-        logging.info(
-            f"🌐 API response keys: {list(data.keys())}, "
-            f"statusCode: {data.get('statusCode')}, "
-            f"items: {len(data.get('itemList', []))}"
-        )
+        profile_url = f"https://www.tiktok.com/@{handle}"

-        for item in data.get("itemList", []):
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(profile_url, download=False)
+
+        if not info:
+            logging.warning("⚠️ yt-dlp returned no info for profile.")
+            return []
+
+        entries = info.get("entries") or []
+        logging.info(f"✅ yt-dlp profile scrape returned {len(entries)} entries.")
+
+        for entry in entries[:SCRAPE_VIDEO_LIMIT]:
            try:
-                vid_id = item.get("id", "")
-                desc   = item.get("desc", "")
+                if not entry:
+                    continue
+
+                vid_id  = str(entry.get("id") or "")
+                url     = entry.get("url") or entry.get("webpage_url") or ""
+                desc    = entry.get("title") or entry.get("description") or ""
+
+                # Normalise URL
+                if vid_id and not url:
                    url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
+                if not vid_id:
+                    m = re.search(r"/video/(\d+)", url)
+                    if m:
+                        vid_id = m.group(1)
+                if not vid_id:
+                    continue
+
                videos.append({
                    "id":        vid_id,
                    "url":       url,
@@ -848,17 +836,21 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
                    "timestamp": arrow.utcnow().isoformat(),
                    "video_url": url,
                })
-            except Exception as e:
-                logging.warning(f"⚠️ API item parse error: {e}")
-
-        logging.info(f"✅ API fallback returned {len(videos)} videos.")

            except Exception as e:
-        logging.warning(f"⚠️ TikTok API fallback failed: {e}")
+                logging.warning(f"⚠️ yt-dlp entry parse error: {e}")
+
+        logging.info(f"✅ yt-dlp fallback produced {len(videos)} videos.")
+
+    except Exception as e:
+        logging.error(f"❌ yt-dlp profile scrape failed: {e}")
+
+    finally:
+        if cookie_file and os.path.exists(cookie_file):
+            os.unlink(cookie_file)

    return videos

-
 def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
    """
    Extract both the numeric user ID and secUid from the profile page HTML.