Added scraped

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 11:51:14 +02:00
parent a5ee10bd8b
commit 3c3dd159e1

View File

@@ -774,73 +774,61 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
def _scrape_via_api(handle: str, cookies: list) -> list:
"""
Fallback: hit TikTok's internal item_list API directly with httpx.
Fallback scraper using yt-dlp to extract the video list from a
TikTok profile. yt-dlp handles TikTok's signing tokens internally.
Returns same list-of-dicts format as the Playwright scraper.
"""
logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
cookie_header = "; ".join(
f"{c.get('name', '')}={c.get('value', '')}"
for c in cookies
if c.get("name") and c.get("value")
)
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36"
),
"Referer": f"https://www.tiktok.com/@{handle}",
"Cookie": cookie_header,
"Accept": "application/json, text/plain, */*",
"Accept-Language": "es-ES,es;q=0.9",
}
user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
if not user_id and not sec_uid:
logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
return []
logging.info(f"📦 Trying yt-dlp profile scrape fallback for @{handle}...")
cookie_file = None
videos = []
try:
params = {
"aid": "1988",
"app_name": "tiktok_web",
"count": str(SCRAPE_VIDEO_LIMIT),
"cursor": "0",
"secUid": sec_uid or "",
"id": user_id or "",
"type": "1",
"sourceType": "8",
"appId": "1233",
"region": "ES",
"priority_region": "ES",
"language": "es",
import yt_dlp
cookie_file = _write_netscape_cookies(cookies)
ydl_opts = {
"quiet": True,
"no_warnings": False,
"extract_flat": True, # metadata only — no download
"playlistend": SCRAPE_VIDEO_LIMIT,
"ignoreerrors": True,
}
resp = httpx.get(
"https://www.tiktok.com/api/post/item_list/",
params=params,
headers=headers,
timeout=20,
follow_redirects=True,
)
resp.raise_for_status()
data = resp.json()
if cookie_file:
ydl_opts["cookiefile"] = cookie_file
logging.info(
f"🌐 API response keys: {list(data.keys())}, "
f"statusCode: {data.get('statusCode')}, "
f"items: {len(data.get('itemList', []))}"
)
profile_url = f"https://www.tiktok.com/@{handle}"
for item in data.get("itemList", []):
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(profile_url, download=False)
if not info:
logging.warning("⚠️ yt-dlp returned no info for profile.")
return []
entries = info.get("entries") or []
logging.info(f"✅ yt-dlp profile scrape returned {len(entries)} entries.")
for entry in entries[:SCRAPE_VIDEO_LIMIT]:
try:
vid_id = item.get("id", "")
desc = item.get("desc", "")
if not entry:
continue
vid_id = str(entry.get("id") or "")
url = entry.get("url") or entry.get("webpage_url") or ""
desc = entry.get("title") or entry.get("description") or ""
# Normalise URL
if vid_id and not url:
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
if not vid_id:
m = re.search(r"/video/(\d+)", url)
if m:
vid_id = m.group(1)
if not vid_id:
continue
videos.append({
"id": vid_id,
"url": url,
@@ -848,17 +836,21 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
"timestamp": arrow.utcnow().isoformat(),
"video_url": url,
})
except Exception as e:
logging.warning(f"⚠️ API item parse error: {e}")
logging.info(f"✅ API fallback returned {len(videos)} videos.")
except Exception as e:
logging.warning(f"⚠️ TikTok API fallback failed: {e}")
logging.warning(f"⚠️ yt-dlp entry parse error: {e}")
logging.info(f"✅ yt-dlp fallback produced {len(videos)} videos.")
except Exception as e:
logging.error(f"❌ yt-dlp profile scrape failed: {e}")
finally:
if cookie_file and os.path.exists(cookie_file):
os.unlink(cookie_file)
return videos
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
"""
Extract both the numeric user ID and secUid from the profile page HTML.