Added scraped
This commit is contained in:
118
tiktok2bsky.py
118
tiktok2bsky.py
@@ -774,73 +774,61 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
|
|||||||
|
|
||||||
def _scrape_via_api(handle: str, cookies: list) -> list:
|
def _scrape_via_api(handle: str, cookies: list) -> list:
|
||||||
"""
|
"""
|
||||||
Fallback: hit TikTok's internal item_list API directly with httpx.
|
Fallback scraper using yt-dlp to extract the video list from a
|
||||||
|
TikTok profile. yt-dlp handles TikTok's signing tokens internally.
|
||||||
Returns same list-of-dicts format as the Playwright scraper.
|
Returns same list-of-dicts format as the Playwright scraper.
|
||||||
"""
|
"""
|
||||||
logging.info(f"🌐 Trying TikTok API fallback for @{handle}...")
|
logging.info(f"📦 Trying yt-dlp profile scrape fallback for @{handle}...")
|
||||||
|
|
||||||
cookie_header = "; ".join(
|
cookie_file = None
|
||||||
f"{c.get('name', '')}={c.get('value', '')}"
|
videos = []
|
||||||
for c in cookies
|
|
||||||
if c.get("name") and c.get("value")
|
|
||||||
)
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"User-Agent": (
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
||||||
"Chrome/124.0.0.0 Safari/537.36"
|
|
||||||
),
|
|
||||||
"Referer": f"https://www.tiktok.com/@{handle}",
|
|
||||||
"Cookie": cookie_header,
|
|
||||||
"Accept": "application/json, text/plain, */*",
|
|
||||||
"Accept-Language": "es-ES,es;q=0.9",
|
|
||||||
}
|
|
||||||
|
|
||||||
user_id, sec_uid = _resolve_tiktok_ids(handle, headers)
|
|
||||||
|
|
||||||
if not user_id and not sec_uid:
|
|
||||||
logging.warning("⚠️ Could not resolve TikTok user ID or secUid for API fallback.")
|
|
||||||
return []
|
|
||||||
|
|
||||||
videos = []
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
params = {
|
import yt_dlp
|
||||||
"aid": "1988",
|
|
||||||
"app_name": "tiktok_web",
|
cookie_file = _write_netscape_cookies(cookies)
|
||||||
"count": str(SCRAPE_VIDEO_LIMIT),
|
|
||||||
"cursor": "0",
|
ydl_opts = {
|
||||||
"secUid": sec_uid or "",
|
"quiet": True,
|
||||||
"id": user_id or "",
|
"no_warnings": False,
|
||||||
"type": "1",
|
"extract_flat": True, # metadata only — no download
|
||||||
"sourceType": "8",
|
"playlistend": SCRAPE_VIDEO_LIMIT,
|
||||||
"appId": "1233",
|
"ignoreerrors": True,
|
||||||
"region": "ES",
|
|
||||||
"priority_region": "ES",
|
|
||||||
"language": "es",
|
|
||||||
}
|
}
|
||||||
resp = httpx.get(
|
if cookie_file:
|
||||||
"https://www.tiktok.com/api/post/item_list/",
|
ydl_opts["cookiefile"] = cookie_file
|
||||||
params=params,
|
|
||||||
headers=headers,
|
|
||||||
timeout=20,
|
|
||||||
follow_redirects=True,
|
|
||||||
)
|
|
||||||
resp.raise_for_status()
|
|
||||||
data = resp.json()
|
|
||||||
|
|
||||||
logging.info(
|
profile_url = f"https://www.tiktok.com/@{handle}"
|
||||||
f"🌐 API response keys: {list(data.keys())}, "
|
|
||||||
f"statusCode: {data.get('statusCode')}, "
|
|
||||||
f"items: {len(data.get('itemList', []))}"
|
|
||||||
)
|
|
||||||
|
|
||||||
for item in data.get("itemList", []):
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
||||||
|
info = ydl.extract_info(profile_url, download=False)
|
||||||
|
|
||||||
|
if not info:
|
||||||
|
logging.warning("⚠️ yt-dlp returned no info for profile.")
|
||||||
|
return []
|
||||||
|
|
||||||
|
entries = info.get("entries") or []
|
||||||
|
logging.info(f"✅ yt-dlp profile scrape returned {len(entries)} entries.")
|
||||||
|
|
||||||
|
for entry in entries[:SCRAPE_VIDEO_LIMIT]:
|
||||||
try:
|
try:
|
||||||
vid_id = item.get("id", "")
|
if not entry:
|
||||||
desc = item.get("desc", "")
|
continue
|
||||||
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
|
|
||||||
|
vid_id = str(entry.get("id") or "")
|
||||||
|
url = entry.get("url") or entry.get("webpage_url") or ""
|
||||||
|
desc = entry.get("title") or entry.get("description") or ""
|
||||||
|
|
||||||
|
# Normalise URL
|
||||||
|
if vid_id and not url:
|
||||||
|
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
|
||||||
|
if not vid_id:
|
||||||
|
m = re.search(r"/video/(\d+)", url)
|
||||||
|
if m:
|
||||||
|
vid_id = m.group(1)
|
||||||
|
if not vid_id:
|
||||||
|
continue
|
||||||
|
|
||||||
videos.append({
|
videos.append({
|
||||||
"id": vid_id,
|
"id": vid_id,
|
||||||
"url": url,
|
"url": url,
|
||||||
@@ -848,17 +836,21 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
"timestamp": arrow.utcnow().isoformat(),
|
"timestamp": arrow.utcnow().isoformat(),
|
||||||
"video_url": url,
|
"video_url": url,
|
||||||
})
|
})
|
||||||
except Exception as e:
|
|
||||||
logging.warning(f"⚠️ API item parse error: {e}")
|
|
||||||
|
|
||||||
logging.info(f"✅ API fallback returned {len(videos)} videos.")
|
except Exception as e:
|
||||||
|
logging.warning(f"⚠️ yt-dlp entry parse error: {e}")
|
||||||
|
|
||||||
|
logging.info(f"✅ yt-dlp fallback produced {len(videos)} videos.")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"⚠️ TikTok API fallback failed: {e}")
|
logging.error(f"❌ yt-dlp profile scrape failed: {e}")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
if cookie_file and os.path.exists(cookie_file):
|
||||||
|
os.unlink(cookie_file)
|
||||||
|
|
||||||
return videos
|
return videos
|
||||||
|
|
||||||
|
|
||||||
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
|
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
|
||||||
"""
|
"""
|
||||||
Extract both the numeric user ID and secUid from the profile page HTML.
|
Extract both the numeric user ID and secUid from the profile page HTML.
|
||||||
|
|||||||
Reference in New Issue
Block a user