Added scraped

2026-05-19 11:51:14 +02:00
parent a5ee10bd8b
commit 3c3dd159e1
1 changed file with 55 additions and 63 deletions
--- a/tiktok2bsky.py
+++ b/tiktok2bsky.py
@@ -774,73 +774,61 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
 def _scrape_via_api(handle: str, cookies: list) -> list:
    """
-    Fallback: hit TikTok's internal item_list API directly with httpx.
+    Fallback scraper using yt-dlp to extract the video list from a
    TikTok profile. yt-dlp handles TikTok's signing tokens internally.
    Returns same list-of-dicts format as the Playwright scraper.
    """
-    logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
+    logging.info(f"📦 Trying yt-dlp profile scrape fallback for @{handle}...")
-    cookie_header = "; ".join(
+    cookie_file = None
-        f"{c.get('name', '')}={c.get('value', '')}"
+    videos      = []
        for c in cookies
        if c.get("name") and c.get("value")
    )
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Referer":         f"https://www.tiktok.com/@{handle}",
        "Cookie":          cookie_header,
        "Accept":          "application/json, text/plain, */*",
        "Accept-Language": "es-ES,es;q=0.9",
    }
    user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
    if not user_id and not sec_uid:
        logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
        return []
    videos = []
    try:
-        params = {
+        import yt_dlp
-            "aid":             "1988",
+
-            "app_name":        "tiktok_web",
+        cookie_file = _write_netscape_cookies(cookies)
-            "count":           str(SCRAPE_VIDEO_LIMIT),
+
-            "cursor":          "0",
+        ydl_opts = {
-            "secUid":          sec_uid or "",
+            "quiet":          True,
-            "id":              user_id or "",
+            "no_warnings":    False,
-            "type":            "1",
+            "extract_flat":   True,       # metadata only — no download
-            "sourceType":      "8",
+            "playlistend":    SCRAPE_VIDEO_LIMIT,
-            "appId":           "1233",
+            "ignoreerrors":   True,
            "region":          "ES",
            "priority_region": "ES",
            "language":        "es",
        }
-        resp = httpx.get(
+        if cookie_file:
-            "https://www.tiktok.com/api/post/item_list/",
+            ydl_opts["cookiefile"] = cookie_file
            params=params,
            headers=headers,
            timeout=20,
            follow_redirects=True,
        )
        resp.raise_for_status()
        data = resp.json()
-        logging.info(
+        profile_url = f"https://www.tiktok.com/@{handle}"
            f"🌐 API response keys: {list(data.keys())}, "
            f"statusCode: {data.get('statusCode')}, "
            f"items: {len(data.get('itemList', []))}"
        )
-        for item in data.get("itemList", []):
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(profile_url, download=False)
        if not info:
            logging.warning("⚠️ yt-dlp returned no info for profile.")
            return []
        entries = info.get("entries") or []
        logging.info(f"✅ yt-dlp profile scrape returned {len(entries)} entries.")
        for entry in entries[:SCRAPE_VIDEO_LIMIT]:
            try:
-                vid_id = item.get("id", "")
+                if not entry:
-                desc   = item.get("desc", "")
+                    continue
-                url    = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
+
                vid_id  = str(entry.get("id") or "")
                url     = entry.get("url") or entry.get("webpage_url") or ""
                desc    = entry.get("title") or entry.get("description") or ""
                # Normalise URL
                if vid_id and not url:
                    url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
                if not vid_id:
                    m = re.search(r"/video/(\d+)", url)
                    if m:
                        vid_id = m.group(1)
                if not vid_id:
                    continue
                videos.append({
                    "id":        vid_id,
                    "url":       url,
@@ -848,17 +836,21 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
                    "timestamp": arrow.utcnow().isoformat(),
                    "video_url": url,
                })
            except Exception as e:
                logging.warning(f"⚠️ API item parse error: {e}")
-        logging.info(f"✅ API fallback returned {len(videos)} videos.")
+            except Exception as e:
                logging.warning(f"⚠️ yt-dlp entry parse error: {e}")
        logging.info(f"✅ yt-dlp fallback produced {len(videos)} videos.")
    except Exception as e:
-        logging.warning(f"⚠️ TikTok API fallback failed: {e}")
+        logging.error(f"❌ yt-dlp profile scrape failed: {e}")
    finally:
        if cookie_file and os.path.exists(cookie_file):
            os.unlink(cookie_file)
    return videos
 def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
    """
    Extract both the numeric user ID and secUid from the profile page HTML.