Some fixes

This commit is contained in:
Guillem Hernandez Sola
2026-05-19 12:06:14 +02:00
parent 8c13f0c355
commit bb6f0e0139
2 changed files with 57 additions and 23 deletions

View File

@@ -37,19 +37,25 @@ pipeline {
sh ''' sh '''
set -euxo pipefail set -euxo pipefail
# Create venv # ── Playwright system dependencies (required in CI) ─
# Installs libglib, libnss, libatk, libdrm, etc.
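# Safe to run even if already installed — exits 0.
# NOTE(review): this step runs before the "Create venv" step below, so on a
# clean workspace "${VENV_DIR}/bin/python" does not exist yet; the first
# command then fails and the chain falls through to the sudo attempt (or to
# the "skipped" message). Confirm whether it should run after venv creation
# and the playwright pip install.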
"${VENV_DIR}/bin/python" -m playwright install-deps chromium || \
sudo playwright install-deps chromium || \
echo "⚠️ playwright install-deps skipped (no sudo) — continuing"
# ── Create venv ────────────────────────────────────
python3 -m venv "${VENV_DIR}" python3 -m venv "${VENV_DIR}"
# Upgrade pip toolchain # ── Upgrade pip toolchain ──────────────────────────
"${VENV_DIR}/bin/python" -m pip install --upgrade pip wheel setuptools "${VENV_DIR}/bin/python" -m pip install --upgrade pip wheel setuptools
# Install all required packages # ── Install all required packages ──────────────────
"${VENV_DIR}/bin/pip" install \ "${VENV_DIR}/bin/pip" install \
--cache-dir "${PIP_CACHE_DIR}" \ --cache-dir "${PIP_CACHE_DIR}" \
-U \ -U \
atproto \ atproto \
playwright \ playwright \
yt-dlp \
httpx \ httpx \
arrow \ arrow \
python-dotenv \ python-dotenv \
@@ -59,8 +65,20 @@ pipeline {
Pillow \ Pillow \
grapheme grapheme
# ── Install playwright-stealth and detect version ── # ── yt-dlp: always upgrade to latest ──────────────
"${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U playwright-stealth # TikTok extractor breaks frequently — latest is required
"${VENV_DIR}/bin/pip" install \
--cache-dir "${PIP_CACHE_DIR}" \
--upgrade \
"yt-dlp"
# Print installed yt-dlp version for traceability
"${VENV_DIR}/bin/pip" show yt-dlp | grep -E "^(Name|Version)"
# ── playwright-stealth ─────────────────────────────
"${VENV_DIR}/bin/pip" install \
--cache-dir "${PIP_CACHE_DIR}" \
-U playwright-stealth
# Print which version was installed for traceability # Print which version was installed for traceability
"${VENV_DIR}/bin/pip" show playwright-stealth | grep -E "^(Name|Version)" "${VENV_DIR}/bin/pip" show playwright-stealth | grep -E "^(Name|Version)"
@@ -87,7 +105,7 @@ except ImportError:
ffmpeg -version | head -1 ffmpeg -version | head -1
ffprobe -version | head -1 ffprobe -version | head -1
# ── Playwright browser binaries (no sudo needed) ─── # ── Playwright browser binaries ────────────────────
"${VENV_DIR}/bin/python" -m playwright install chromium "${VENV_DIR}/bin/python" -m playwright install chromium
''' '''
} }
@@ -141,7 +159,7 @@ except ImportError:
post { post {
always { always {
// Archive logs, state, and any CAPTCHA/debug screenshots // Archive logs, state, and any debug screenshots
archiveArtifacts( archiveArtifacts(
artifacts: '*.log, *.json, screenshot_*.png', artifacts: '*.log, *.json, screenshot_*.png',
allowEmptyArchive: true allowEmptyArchive: true

View File

@@ -774,11 +774,11 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
def _scrape_via_api(handle: str, cookies: list) -> list: def _scrape_via_api(handle: str, cookies: list) -> list:
""" """
Fallback scraper using yt-dlp to extract the video list from a Fallback scraper using yt-dlp to list videos from a TikTok profile.
TikTok profile. yt-dlp handles TikTok's signing tokens internally. yt-dlp handles TikTok's request signing internally — no raw API needed.
Returns same list-of-dicts format as the Playwright scraper. Returns same list-of-dicts format as the Playwright scraper.
""" """
logging.info(f"📦 Trying yt-dlp profile scrape fallback for @{handle}...") logging.info(f"📦 yt-dlp profile scrape fallback for @{handle}...")
cookie_file = None cookie_file = None
videos = [] videos = []
@@ -791,7 +791,7 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
ydl_opts = { ydl_opts = {
"quiet": True, "quiet": True,
"no_warnings": False, "no_warnings": False,
"extract_flat": True, # metadata only — no download "extract_flat": True, # metadata only — no video download yet
"playlistend": SCRAPE_VIDEO_LIMIT, "playlistend": SCRAPE_VIDEO_LIMIT,
"ignoreerrors": True, "ignoreerrors": True,
} }
@@ -799,6 +799,7 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
ydl_opts["cookiefile"] = cookie_file ydl_opts["cookiefile"] = cookie_file
profile_url = f"https://www.tiktok.com/@{handle}" profile_url = f"https://www.tiktok.com/@{handle}"
logging.info(f"🌐 yt-dlp extracting: {profile_url}")
with yt_dlp.YoutubeDL(ydl_opts) as ydl: with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(profile_url, download=False) info = ydl.extract_info(profile_url, download=False)
@@ -808,7 +809,10 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
return [] return []
entries = info.get("entries") or [] entries = info.get("entries") or []
logging.info(f"✅ yt-dlp profile scrape returned {len(entries)} entries.") logging.info(
f"✅ yt-dlp returned {len(entries)} entries "
f"(playlist: {info.get('title', '?')})"
)
for entry in entries[:SCRAPE_VIDEO_LIMIT]: for entry in entries[:SCRAPE_VIDEO_LIMIT]:
try: try:
@@ -816,17 +820,29 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
continue continue
vid_id = str(entry.get("id") or "") vid_id = str(entry.get("id") or "")
url = entry.get("url") or entry.get("webpage_url") or "" url = (
desc = entry.get("title") or entry.get("description") or "" entry.get("webpage_url")
or entry.get("url")
or ""
)
desc = (
entry.get("title")
or entry.get("description")
or ""
)
# Normalise URL # Normalise URL
if vid_id and not url: if vid_id and not url:
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}" url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
if not vid_id:
# Extract ID from URL if missing
if not vid_id and url:
m = re.search(r"/video/(\d+)", url) m = re.search(r"/video/(\d+)", url)
if m: if m:
vid_id = m.group(1) vid_id = m.group(1)
if not vid_id: if not vid_id:
logging.debug(f"⏭️ Skipping entry with no ID: {entry}")
continue continue
videos.append({ videos.append({
@@ -836,11 +852,12 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
"timestamp": arrow.utcnow().isoformat(), "timestamp": arrow.utcnow().isoformat(),
"video_url": url, "video_url": url,
}) })
logging.debug(f" 📹 {vid_id}: {desc[:60]}")
except Exception as e: except Exception as e:
logging.warning(f"⚠️ yt-dlp entry parse error: {e}") logging.warning(f"⚠️ yt-dlp entry parse error: {e}")
logging.info(f"✅ yt-dlp fallback produced {len(videos)} videos.") logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.")
except Exception as e: except Exception as e:
logging.error(f"❌ yt-dlp profile scrape failed: {e}") logging.error(f"❌ yt-dlp profile scrape failed: {e}")
@@ -850,7 +867,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
os.unlink(cookie_file) os.unlink(cookie_file)
return videos return videos
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]: def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
""" """
Extract both the numeric user ID and secUid from the profile page HTML. Extract both the numeric user ID and secUid from the profile page HTML.