Added all

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 11:45:52 +02:00
parent 97ff3cdb4a
commit a5ee10bd8b

View File

@@ -772,7 +772,6 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
pass
return False
def _scrape_via_api(handle: str, cookies: list) -> list:
"""
Fallback: hit TikTok's internal item_list API directly with httpx.
@@ -780,7 +779,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
"""
logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
# Build a cookie header string from the injected cookies
cookie_header = "; ".join(
f"{c.get('name', '')}={c.get('value', '')}"
for c in cookies
@@ -793,35 +791,34 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36"
),
"Referer": f"https://www.tiktok.com/@{handle}",
"Cookie": cookie_header,
"Accept": "application/json, text/plain, */*",
"Referer": f"https://www.tiktok.com/@{handle}",
"Cookie": cookie_header,
"Accept": "application/json, text/plain, */*",
"Accept-Language": "es-ES,es;q=0.9",
}
# Resolve the numeric user ID from the profile page HTML
user_id = _resolve_tiktok_user_id(handle, headers)
if not user_id:
logging.warning("⚠️ Could not resolve TikTok user ID for API fallback.")
user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
if not user_id and not sec_uid:
logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
return []
videos = []
cursor = 0
try:
params = {
"aid": "1988",
"app_name": "tiktok_web",
"count": str(SCRAPE_VIDEO_LIMIT),
"cursor": str(cursor),
"secUid": "",
"id": user_id,
"type": "1",
"sourceType": "8",
"appId": "1233",
"region": "ES",
"aid": "1988",
"app_name": "tiktok_web",
"count": str(SCRAPE_VIDEO_LIMIT),
"cursor": "0",
"secUid": sec_uid or "",
"id": user_id or "",
"type": "1",
"sourceType": "8",
"appId": "1233",
"region": "ES",
"priority_region": "ES",
"language": "es",
"language": "es",
}
resp = httpx.get(
"https://www.tiktok.com/api/post/item_list/",
@@ -833,11 +830,17 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
resp.raise_for_status()
data = resp.json()
logging.info(
f"🌐 API response keys: {list(data.keys())}, "
f"statusCode: {data.get('statusCode')}, "
f"items: {len(data.get('itemList', []))}"
)
for item in data.get("itemList", []):
try:
vid_id = item.get("id", "")
desc = item.get("desc", "")
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
vid_id = item.get("id", "")
desc = item.get("desc", "")
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
videos.append({
"id": vid_id,
"url": url,
@@ -856,8 +859,14 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
return videos
def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
"""Extract the numeric TikTok user ID from the profile page HTML."""
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
"""
Extract both the numeric user ID and secUid from the profile page HTML.
Returns (user_id, sec_uid) — either may be None.
"""
user_id = None
sec_uid = None
try:
resp = httpx.get(
f"https://www.tiktok.com/@{handle}",
@@ -865,21 +874,61 @@ def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
timeout=15,
follow_redirects=True,
)
# TikTok embeds user data in a __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag
match = re.search(r'"uniqueId"\s*:\s*"[^"]+"\s*,\s*"id"\s*:\s*"(\d+)"', resp.text)
if match:
uid = match.group(1)
logging.info(f"✅ Resolved TikTok user ID: {uid}")
return uid
# Fallback pattern
match = re.search(r'"authorId"\s*:\s*"(\d+)"', resp.text)
if match:
return match.group(1)
html = resp.text
# ── Numeric user ID ────────────────────────────────────────────
id_patterns = [
r'"authorId"\s*:\s*"(\d{15,25})"',
r'"author"\s*:\s*\{[^}]*"id"\s*:\s*"(\d{15,25})"',
r'"userId"\s*:\s*"(\d{15,25})"',
r'"uid"\s*:\s*"(\d{15,25})"',
r'"ownerUid"\s*:\s*"(\d{15,25})"',
r',"id":"(\d{15,25})","uniqueId":"' + re.escape(handle) + r'"',
r'"uniqueId":"' + re.escape(handle) + r'","id":"(\d{15,25})"',
]
for pattern in id_patterns:
m = re.search(pattern, html, re.IGNORECASE)
if m:
user_id = m.group(1)
logging.info(f"✅ Resolved TikTok user ID: {user_id}")
break
# ── secUid ─────────────────────────────────────────────────────
sec_patterns = [
r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
r'"authorSecId"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
]
for pattern in sec_patterns:
m = re.search(pattern, html, re.IGNORECASE)
if m:
sec_uid = m.group(1)
logging.info(f"✅ Resolved TikTok secUid: {sec_uid[:30]}...")
break
if not user_id and not sec_uid:
# Window search fallback
handle_pos = html.find(f'"uniqueId":"{handle}"')
if handle_pos != -1:
window = html[max(0, handle_pos - 300): handle_pos + 300]
m = re.search(r'"id"\s*:\s*"(\d{15,25})"', window)
if m:
user_id = m.group(1)
logging.info(f"✅ Resolved TikTok user ID (window): {user_id}")
m = re.search(r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', window)
if m:
sec_uid = m.group(1)
logging.info(f"✅ Resolved TikTok secUid (window): {sec_uid[:30]}...")
if not user_id and not sec_uid:
logging.warning(
f"⚠️ Could not resolve any TikTok ID for @{handle}. "
f"HTML length: {len(html)} chars."
)
except Exception as e:
logging.warning(f"⚠️ Could not resolve TikTok user ID: {e}")
return None
logging.warning(f"⚠️ Could not resolve TikTok IDs: {e}")
return user_id, sec_uid
def scrape_tiktoks_via_playwright(handle: str) -> list:
"""
Scrape recent videos from a public TikTok profile.