From 3c3dd159e1886fe9ca2486c760c1804bfd0d82f3 Mon Sep 17 00:00:00 2001 From: Guillem Hernandez Sola Date: Tue, 19 May 2026 11:51:14 +0200 Subject: [PATCH] Added scraped --- tiktok2bsky.py | 118 +++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 63 deletions(-) diff --git a/tiktok2bsky.py b/tiktok2bsky.py index b473ae9..ca6a9c5 100644 --- a/tiktok2bsky.py +++ b/tiktok2bsky.py @@ -774,73 +774,61 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool: def _scrape_via_api(handle: str, cookies: list) -> list: """ - Fallback: hit TikTok's internal item_list API directly with httpx. + Fallback scraper using yt-dlp to extract the video list from a + TikTok profile. yt-dlp handles TikTok's signing tokens internally. Returns same list-of-dicts format as the Playwright scraper. """ - logging.info(f"🌐 Trying TikTok API fallback for @{handle}...") + logging.info(f"📦 Trying yt-dlp profile scrape fallback for @{handle}...") - cookie_header = "; ".join( - f"{c.get('name', '')}={c.get('value', '')}" - for c in cookies - if c.get("name") and c.get("value") - ) - - headers = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/124.0.0.0 Safari/537.36" - ), - "Referer": f"https://www.tiktok.com/@{handle}", - "Cookie": cookie_header, - "Accept": "application/json, text/plain, */*", - "Accept-Language": "es-ES,es;q=0.9", - } - - user_id, sec_uid = _resolve_tiktok_ids(handle, headers) - - if not user_id and not sec_uid: - logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.") - return [] - - videos = [] + cookie_file = None + videos = [] try: - params = { - "aid": "1988", - "app_name": "tiktok_web", - "count": str(SCRAPE_VIDEO_LIMIT), - "cursor": "0", - "secUid": sec_uid or "", - "id": user_id or "", - "type": "1", - "sourceType": "8", - "appId": "1233", - "region": "ES", - "priority_region": "ES", - "language": "es", + import yt_dlp + + cookie_file = _write_netscape_cookies(cookies) + + ydl_opts = { + "quiet": True, + "no_warnings": False, + "extract_flat": True, # metadata only — no download + "playlistend": SCRAPE_VIDEO_LIMIT, + "ignoreerrors": True, } - resp = httpx.get( - "https://www.tiktok.com/api/post/item_list/", - params=params, - headers=headers, - timeout=20, - follow_redirects=True, - ) - resp.raise_for_status() - data = resp.json() + if cookie_file: + ydl_opts["cookiefile"] = cookie_file - logging.info( - f"🌐 API response keys: {list(data.keys())}, " - f"statusCode: {data.get('statusCode')}, " - f"items: {len(data.get('itemList', []))}" - ) + profile_url = f"https://www.tiktok.com/@{handle}" - for item in data.get("itemList", []): + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(profile_url, download=False) + + if not info: + logging.warning("⚠️ yt-dlp returned no info for profile.") + return [] + + entries = info.get("entries") or [] + logging.info(f"✅ yt-dlp profile scrape returned {len(entries)} entries.") + + for entry in entries[:SCRAPE_VIDEO_LIMIT]: try: - vid_id = item.get("id", "") - desc = item.get("desc", "") - url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" + if not entry: + continue + + vid_id = str(entry.get("id") or "") + url = entry.get("url") or entry.get("webpage_url") or "" + desc = entry.get("title") or entry.get("description") or "" + + # Normalise URL + if vid_id and not url: + url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" + if not vid_id: + m = re.search(r"/video/(\d+)", url) + if m: + vid_id = m.group(1) + if not vid_id: + continue + videos.append({ "id": vid_id, "url": url, @@ -848,17 +836,21 @@ def _scrape_via_api(handle: str, cookies: list) -> list: "timestamp": arrow.utcnow().isoformat(), "video_url": url, }) - except Exception as e: - logging.warning(f"⚠️ API item parse error: {e}") - logging.info(f"✅ API fallback returned {len(videos)} videos.") + except Exception as e: + logging.warning(f"⚠️ yt-dlp entry parse error: {e}") + + logging.info(f"✅ yt-dlp fallback produced {len(videos)} videos.") except Exception as e: - logging.warning(f"⚠️ TikTok API fallback failed: {e}") + logging.error(f"❌ yt-dlp profile scrape failed: {e}") + + finally: + if cookie_file and os.path.exists(cookie_file): + os.unlink(cookie_file) return videos - def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]: """ Extract both the numeric user ID and secUid from the profile page HTML.