Added all
This commit is contained in:
@@ -772,7 +772,6 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
|
|||||||
pass
|
pass
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _scrape_via_api(handle: str, cookies: list) -> list:
|
def _scrape_via_api(handle: str, cookies: list) -> list:
|
||||||
"""
|
"""
|
||||||
Fallback: hit TikTok's internal item_list API directly with httpx.
|
Fallback: hit TikTok's internal item_list API directly with httpx.
|
||||||
@@ -780,7 +779,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
"""
|
"""
|
||||||
logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
|
logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
|
||||||
|
|
||||||
# Build a cookie header string from the injected cookies
|
|
||||||
cookie_header = "; ".join(
|
cookie_header = "; ".join(
|
||||||
f"{c.get('name', '')}={c.get('value', '')}"
|
f"{c.get('name', '')}={c.get('value', '')}"
|
||||||
for c in cookies
|
for c in cookies
|
||||||
@@ -799,23 +797,22 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
"Accept-Language": "es-ES,es;q=0.9",
|
"Accept-Language": "es-ES,es;q=0.9",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Resolve the numeric user ID from the profile page HTML
|
user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
|
||||||
user_id = _resolve_tiktok_user_id(handle, headers)
|
|
||||||
if not user_id:
|
if not user_id and not sec_uid:
|
||||||
logging.warning("⚠️ Could not resolve TikTok user ID for API fallback.")
|
logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
videos = []
|
videos = []
|
||||||
cursor = 0
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
params = {
|
params = {
|
||||||
"aid": "1988",
|
"aid": "1988",
|
||||||
"app_name": "tiktok_web",
|
"app_name": "tiktok_web",
|
||||||
"count": str(SCRAPE_VIDEO_LIMIT),
|
"count": str(SCRAPE_VIDEO_LIMIT),
|
||||||
"cursor": str(cursor),
|
"cursor": "0",
|
||||||
"secUid": "",
|
"secUid": sec_uid or "",
|
||||||
"id": user_id,
|
"id": user_id or "",
|
||||||
"type": "1",
|
"type": "1",
|
||||||
"sourceType": "8",
|
"sourceType": "8",
|
||||||
"appId": "1233",
|
"appId": "1233",
|
||||||
@@ -833,6 +830,12 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
f"🌐 API response keys: {list(data.keys())}, "
|
||||||
|
f"statusCode: {data.get('statusCode')}, "
|
||||||
|
f"items: {len(data.get('itemList', []))}"
|
||||||
|
)
|
||||||
|
|
||||||
for item in data.get("itemList", []):
|
for item in data.get("itemList", []):
|
||||||
try:
|
try:
|
||||||
vid_id = item.get("id", "")
|
vid_id = item.get("id", "")
|
||||||
@@ -856,8 +859,14 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
return videos
|
return videos
|
||||||
|
|
||||||
|
|
||||||
def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
|
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
|
||||||
"""Extract the numeric TikTok user ID from the profile page HTML."""
|
"""
|
||||||
|
Extract both the numeric user ID and secUid from the profile page HTML.
|
||||||
|
Returns (user_id, sec_uid) — either may be None.
|
||||||
|
"""
|
||||||
|
user_id = None
|
||||||
|
sec_uid = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
resp = httpx.get(
|
resp = httpx.get(
|
||||||
f"https://www.tiktok.com/@{handle}",
|
f"https://www.tiktok.com/@{handle}",
|
||||||
@@ -865,21 +874,61 @@ def _resolve_tiktok_user_id(handle: str, headers: dict) -> str | None:
|
|||||||
timeout=15,
|
timeout=15,
|
||||||
follow_redirects=True,
|
follow_redirects=True,
|
||||||
)
|
)
|
||||||
# TikTok embeds user data in a __UNIVERSAL_DATA_FOR_REHYDRATION__ script tag
|
html = resp.text
|
||||||
match = re.search(r'"uniqueId"\s*:\s*"[^"]+"\s*,\s*"id"\s*:\s*"(\d+)"', resp.text)
|
|
||||||
if match:
|
# ── Numeric user ID ────────────────────────────────────────────
|
||||||
uid = match.group(1)
|
id_patterns = [
|
||||||
logging.info(f"✅ Resolved TikTok user ID: {uid}")
|
r'"authorId"\s*:\s*"(\d{15,25})"',
|
||||||
return uid
|
r'"author"\s*:\s*\{[^}]*"id"\s*:\s*"(\d{15,25})"',
|
||||||
# Fallback pattern
|
r'"userId"\s*:\s*"(\d{15,25})"',
|
||||||
match = re.search(r'"authorId"\s*:\s*"(\d+)"', resp.text)
|
r'"uid"\s*:\s*"(\d{15,25})"',
|
||||||
if match:
|
r'"ownerUid"\s*:\s*"(\d{15,25})"',
|
||||||
return match.group(1)
|
r',"id":"(\d{15,25})","uniqueId":"' + re.escape(handle) + r'"',
|
||||||
|
r'"uniqueId":"' + re.escape(handle) + r'","id":"(\d{15,25})"',
|
||||||
|
]
|
||||||
|
for pattern in id_patterns:
|
||||||
|
m = re.search(pattern, html, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
user_id = m.group(1)
|
||||||
|
logging.info(f"✅ Resolved TikTok user ID: {user_id}")
|
||||||
|
break
|
||||||
|
|
||||||
|
# ── secUid ─────────────────────────────────────────────────────
|
||||||
|
sec_patterns = [
|
||||||
|
r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
|
||||||
|
r'"authorSecId"\s*:\s*"([A-Za-z0-9_\-]{20,})"',
|
||||||
|
]
|
||||||
|
for pattern in sec_patterns:
|
||||||
|
m = re.search(pattern, html, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
sec_uid = m.group(1)
|
||||||
|
logging.info(f"✅ Resolved TikTok secUid: {sec_uid[:30]}...")
|
||||||
|
break
|
||||||
|
|
||||||
|
if not user_id and not sec_uid:
|
||||||
|
# Window search fallback
|
||||||
|
handle_pos = html.find(f'"uniqueId":"{handle}"')
|
||||||
|
if handle_pos != -1:
|
||||||
|
window = html[max(0, handle_pos - 300): handle_pos + 300]
|
||||||
|
m = re.search(r'"id"\s*:\s*"(\d{15,25})"', window)
|
||||||
|
if m:
|
||||||
|
user_id = m.group(1)
|
||||||
|
logging.info(f"✅ Resolved TikTok user ID (window): {user_id}")
|
||||||
|
m = re.search(r'"secUid"\s*:\s*"([A-Za-z0-9_\-]{20,})"', window)
|
||||||
|
if m:
|
||||||
|
sec_uid = m.group(1)
|
||||||
|
logging.info(f"✅ Resolved TikTok secUid (window): {sec_uid[:30]}...")
|
||||||
|
|
||||||
|
if not user_id and not sec_uid:
|
||||||
|
logging.warning(
|
||||||
|
f"⚠️ Could not resolve any TikTok ID for @{handle}. "
|
||||||
|
f"HTML length: {len(html)} chars."
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"⚠️ Could not resolve TikTok user ID: {e}")
|
logging.warning(f"⚠️ Could not resolve TikTok IDs: {e}")
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
|
return user_id, sec_uid
|
||||||
def scrape_tiktoks_via_playwright(handle: str) -> list:
|
def scrape_tiktoks_via_playwright(handle: str) -> list:
|
||||||
"""
|
"""
|
||||||
Scrape recent videos from a public TikTok profile.
|
Scrape recent videos from a public TikTok profile.
|
||||||
|
|||||||
Reference in New Issue
Block a user