Added all
This commit is contained in:
@@ -772,7 +772,6 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
def _scrape_via_api(handle: str, cookies: list) -> list:
|
||||
"""
|
||||
Fallback: hit TikTok's internal item_list API directly with httpx.
|
||||
@@ -780,7 +779,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
||||
"""
|
||||
logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
|
||||
|
||||
# Build a cookie header string from the injected cookies
|
||||
cookie_header = "; ".join(
|
||||
f"{c.get('name', '')}={c.get('value', '')}"
|
||||
for c in cookies
|
||||
@@ -799,23 +797,22 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
||||
"Accept-Language": "es-ES,es;q=0.9",
|
||||
}
|
||||
|
||||
# Resolve the numeric user ID from the profile page HTML
|
||||
user_id = _resolve_tiktok_user_id(handle, headers)
|
||||
if not user_id:
|
||||
logging.warning("⚠️ Could not resolve TikTok user ID for API fallback.")
|
||||
user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
|
||||
|
||||
if not user_id and not sec_uid:
|
||||
logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
|
||||
return []
|
||||
|
||||
videos = []
|
||||
cursor = 0
|
||||
|
||||
try:
|
||||
params = {
|
||||
"aid": "1988",
|
||||
"app_name": "tiktok_web",
|
||||
"count": str(SCRAPE_VIDEO_LIMIT),
|
||||
"cursor": str(cursor),
|
||||
"secUid": "",
|
||||
"id": user_id,
|
||||
"cursor": "0",
|
||||
"secUid": sec_uid or "",
|
||||
"id": user_id or "",
|
||||
"type": "1",
|
||||
"sourceType": "8",
|
||||
"appId": "1233",
|
||||
@@ -833,6 +830,12 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
logging.info(
|
||||
f"🌐 API response keys: {list(data.keys())}, "
|
||||
f"statusCode: {data.get('statusCode')}, "
|
||||
f"items: {len(data.get('itemList', []))}"
|
||||
)
|
||||
|
||||
for item in data.get("itemList", []):
|
||||
try:
|
||||
vid_id = item.get("id", "")
|
||||
@@ -856,8 +859,14 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
||||
return videos
|
||||
|
||||
|
||||
def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
|
||||
"""Extract the numeric TikTok user ID from the profile page HTML."""
|
||||
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
|
||||
"""
|
||||
Extract both the numeric user ID and secUid from the profile page HTML.
|
||||
Returns (user_id, sec_uid) — either may be None.
|
||||
"""
|
||||
user_id = None
|
||||
sec_uid = None
|
||||
|
||||
try:
|
||||
resp = httpx.get(
|
||||
f"https://www.tiktok.com/@{handle}",
|
||||
@@ -865,21 +874,61 @@ def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
|
||||
timeout=15,
|
||||
follow_redirects=True,
|
||||
)
|
||||
# TikTok embeds user data in a __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag
|
||||
match = re.search(r'"uniqueId"\s*:\s*"[^"]+"\s*,\s*"id"\s*:\s*"(\d+)"', resp.text)
|
||||
if match:
|
||||
uid = match.group(1)
|
||||
logging.info(f"✅ Resolved TikTok user ID: {uid}")
|
||||
return uid
|
||||
# Fallback pattern
|
||||
match = re.search(r'"authorId"\s*:\s*"(\d+)"', resp.text)
|
||||
if match:
|
||||
return match.group(1)
|
||||
html = resp.text
|
||||
|
||||
# ── Numeric user ID ────────────────────────────────────────────
|
||||
id_patterns = [
|
||||
r'"authorId"\s*:\s*"(\d{15,25})"',
|
||||
r'"author"\s*:\s*\{[^}]*"id"\s*:\s*"(\d{15,25})"',
|
||||
r'"userId"\s*:\s*"(\d{15,25})"',
|
||||
r'"uid"\s*:\s*"(\d{15,25})"',
|
||||
r'"ownerUid"\s*:\s*"(\d{15,25})"',
|
||||
r',"id":"(\d{15,25})","uniqueId":"' + re.escape(handle) + r'"',
|
||||
r'"uniqueId":"' + re.escape(handle) + r'","id":"(\d{15,25})"',
|
||||
]
|
||||
for pattern in id_patterns:
|
||||
m = re.search(pattern, html, re.IGNORECASE)
|
||||
if m:
|
||||
user_id = m.group(1)
|
||||
logging.info(f"✅ Resolved TikTok user ID: {user_id}")
|
||||
break
|
||||
|
||||
# ── secUid ─────────────────────────────────────────────────────
|
||||
sec_patterns = [
|
||||
r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
|
||||
r'"authorSecId"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
|
||||
]
|
||||
for pattern in sec_patterns:
|
||||
m = re.search(pattern, html, re.IGNORECASE)
|
||||
if m:
|
||||
sec_uid = m.group(1)
|
||||
logging.info(f"✅ Resolved TikTok secUid: {sec_uid[:30]}...")
|
||||
break
|
||||
|
||||
if not user_id and not sec_uid:
|
||||
# Window search fallback
|
||||
handle_pos = html.find(f'"uniqueId":"{handle}"')
|
||||
if handle_pos != -1:
|
||||
window = html[max(0, handle_pos - 300): handle_pos + 300]
|
||||
m = re.search(r'"id"\s*:\s*"(\d{15,25})"', window)
|
||||
if m:
|
||||
user_id = m.group(1)
|
||||
logging.info(f"✅ Resolved TikTok user ID (window): {user_id}")
|
||||
m = re.search(r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', window)
|
||||
if m:
|
||||
sec_uid = m.group(1)
|
||||
logging.info(f"✅ Resolved TikTok secUid (window): {sec_uid[:30]}...")
|
||||
|
||||
if not user_id and not sec_uid:
|
||||
logging.warning(
|
||||
f"⚠️ Could not resolve any TikTok ID for @{handle}. "
|
||||
f"HTML length: {len(html)} chars."
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logging.warning(f"⚠️ Could not resolve TikTok user ID: {e}")
|
||||
return None
|
||||
|
||||
logging.warning(f"⚠️ Could not resolve TikTok IDs: {e}")
|
||||
|
||||
return user_id, sec_uid
|
||||
def scrape_tiktoks_via_playwright(handle: str) -> list:
|
||||
"""
|
||||
Scrape recent videos from a public TikTok profile.
|
||||
|
||||
Reference in New Issue
Block a user