Some fixes
This commit is contained in:
@@ -37,19 +37,25 @@ pipeline {
|
|||||||
sh '''
|
sh '''
|
||||||
set -euxo pipefail
|
set -euxo pipefail
|
||||||
|
|
||||||
# Create venv
|
# ── Playwright system dependencies (required in CI) ─
|
||||||
|
# Installs libglib, libnss, libatk, libdrm, etc.
|
||||||
|
# Safe to run even if already installed — exits 0.
|
||||||
|
"${VENV_DIR}/bin/python" -m playwright install-deps chromium || \
|
||||||
|
sudo playwright install-deps chromium || \
|
||||||
|
echo "⚠️ playwright install-deps skipped (no sudo) — continuing"
|
||||||
|
|
||||||
|
# ── Create venv ────────────────────────────────────
|
||||||
python3 -m venv "${VENV_DIR}"
|
python3 -m venv "${VENV_DIR}"
|
||||||
|
|
||||||
# Upgrade pip toolchain
|
# ── Upgrade pip toolchain ──────────────────────────
|
||||||
"${VENV_DIR}/bin/python" -m pip install --upgrade pip wheel setuptools
|
"${VENV_DIR}/bin/python" -m pip install --upgrade pip wheel setuptools
|
||||||
|
|
||||||
# Install all required packages
|
# ── Install all required packages ──────────────────
|
||||||
"${VENV_DIR}/bin/pip" install \
|
"${VENV_DIR}/bin/pip" install \
|
||||||
--cache-dir "${PIP_CACHE_DIR}" \
|
--cache-dir "${PIP_CACHE_DIR}" \
|
||||||
-U \
|
-U \
|
||||||
atproto \
|
atproto \
|
||||||
playwright \
|
playwright \
|
||||||
yt-dlp \
|
|
||||||
httpx \
|
httpx \
|
||||||
arrow \
|
arrow \
|
||||||
python-dotenv \
|
python-dotenv \
|
||||||
@@ -59,8 +65,20 @@ pipeline {
|
|||||||
Pillow \
|
Pillow \
|
||||||
grapheme
|
grapheme
|
||||||
|
|
||||||
# ── Install playwright-stealth and detect version ──
|
# ── yt-dlp: always upgrade to latest ──────────────
|
||||||
"${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U playwright-stealth
|
# TikTok extractor breaks frequently — latest is required
|
||||||
|
"${VENV_DIR}/bin/pip" install \
|
||||||
|
--cache-dir "${PIP_CACHE_DIR}" \
|
||||||
|
--upgrade \
|
||||||
|
"yt-dlp"
|
||||||
|
|
||||||
|
# Print installed yt-dlp version for traceability
|
||||||
|
"${VENV_DIR}/bin/pip" show yt-dlp | grep -E "^(Name|Version)"
|
||||||
|
|
||||||
|
# ── playwright-stealth ─────────────────────────────
|
||||||
|
"${VENV_DIR}/bin/pip" install \
|
||||||
|
--cache-dir "${PIP_CACHE_DIR}" \
|
||||||
|
-U playwright-stealth
|
||||||
|
|
||||||
# Print which version was installed for traceability
|
# Print which version was installed for traceability
|
||||||
"${VENV_DIR}/bin/pip" show playwright-stealth | grep -E "^(Name|Version)"
|
"${VENV_DIR}/bin/pip" show playwright-stealth | grep -E "^(Name|Version)"
|
||||||
@@ -87,7 +105,7 @@ except ImportError:
|
|||||||
ffmpeg -version | head -1
|
ffmpeg -version | head -1
|
||||||
ffprobe -version | head -1
|
ffprobe -version | head -1
|
||||||
|
|
||||||
# ── Playwright browser binaries (no sudo needed) ───
|
# ── Playwright browser binaries ────────────────────
|
||||||
"${VENV_DIR}/bin/python" -m playwright install chromium
|
"${VENV_DIR}/bin/python" -m playwright install chromium
|
||||||
'''
|
'''
|
||||||
}
|
}
|
||||||
@@ -141,7 +159,7 @@ except ImportError:
|
|||||||
post {
|
post {
|
||||||
|
|
||||||
always {
|
always {
|
||||||
// Archive logs, state, and any CAPTCHA/debug screenshots
|
// Archive logs, state, and any debug screenshots
|
||||||
archiveArtifacts(
|
archiveArtifacts(
|
||||||
artifacts: '*.log, *.json, screenshot_*.png',
|
artifacts: '*.log, *.json, screenshot_*.png',
|
||||||
allowEmptyArchive: true
|
allowEmptyArchive: true
|
||||||
|
|||||||
@@ -774,11 +774,11 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:
|
|||||||
|
|
||||||
def _scrape_via_api(handle: str, cookies: list) -> list:
|
def _scrape_via_api(handle: str, cookies: list) -> list:
|
||||||
"""
|
"""
|
||||||
Fallback scraper using yt-dlp to extract the video list from a
|
Fallback scraper using yt-dlp to list videos from a TikTok profile.
|
||||||
TikTok profile. yt-dlp handles TikTok's signing tokens internally.
|
yt-dlp handles TikTok's request signing internally — no raw API needed.
|
||||||
Returns same list-of-dicts format as the Playwright scraper.
|
Returns same list-of-dicts format as the Playwright scraper.
|
||||||
"""
|
"""
|
||||||
logging.info(f"📦 Trying yt-dlp profile scrape fallback for @{handle}...")
|
logging.info(f"📦 yt-dlp profile scrape fallback for @{handle}...")
|
||||||
|
|
||||||
cookie_file = None
|
cookie_file = None
|
||||||
videos = []
|
videos = []
|
||||||
@@ -789,16 +789,17 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
cookie_file = _write_netscape_cookies(cookies)
|
cookie_file = _write_netscape_cookies(cookies)
|
||||||
|
|
||||||
ydl_opts = {
|
ydl_opts = {
|
||||||
"quiet": True,
|
"quiet": True,
|
||||||
"no_warnings": False,
|
"no_warnings": False,
|
||||||
"extract_flat": True, # metadata only — no download
|
"extract_flat": True, # metadata only — no video download yet
|
||||||
"playlistend": SCRAPE_VIDEO_LIMIT,
|
"playlistend": SCRAPE_VIDEO_LIMIT,
|
||||||
"ignoreerrors": True,
|
"ignoreerrors": True,
|
||||||
}
|
}
|
||||||
if cookie_file:
|
if cookie_file:
|
||||||
ydl_opts["cookiefile"] = cookie_file
|
ydl_opts["cookiefile"] = cookie_file
|
||||||
|
|
||||||
profile_url = f"https://www.tiktok.com/@{handle}"
|
profile_url = f"https://www.tiktok.com/@{handle}"
|
||||||
|
logging.info(f"🌐 yt-dlp extracting: {profile_url}")
|
||||||
|
|
||||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
||||||
info = ydl.extract_info(profile_url, download=False)
|
info = ydl.extract_info(profile_url, download=False)
|
||||||
@@ -808,25 +809,40 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
entries = info.get("entries") or []
|
entries = info.get("entries") or []
|
||||||
logging.info(f"✅ yt-dlp profile scrape returned {len(entries)} entries.")
|
logging.info(
|
||||||
|
f"✅ yt-dlp returned {len(entries)} entries "
|
||||||
|
f"(playlist: {info.get('title', '?')})"
|
||||||
|
)
|
||||||
|
|
||||||
for entry in entries[:SCRAPE_VIDEO_LIMIT]:
|
for entry in entries[:SCRAPE_VIDEO_LIMIT]:
|
||||||
try:
|
try:
|
||||||
if not entry:
|
if not entry:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
vid_id = str(entry.get("id") or "")
|
vid_id = str(entry.get("id") or "")
|
||||||
url = entry.get("url") or entry.get("webpage_url") or ""
|
url = (
|
||||||
desc = entry.get("title") or entry.get("description") or ""
|
entry.get("webpage_url")
|
||||||
|
or entry.get("url")
|
||||||
|
or ""
|
||||||
|
)
|
||||||
|
desc = (
|
||||||
|
entry.get("title")
|
||||||
|
or entry.get("description")
|
||||||
|
or ""
|
||||||
|
)
|
||||||
|
|
||||||
# Normalise URL
|
# Normalise URL
|
||||||
if vid_id and not url:
|
if vid_id and not url:
|
||||||
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
|
url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
|
||||||
if not vid_id:
|
|
||||||
|
# Extract ID from URL if missing
|
||||||
|
if not vid_id and url:
|
||||||
m = re.search(r"/video/(\d+)", url)
|
m = re.search(r"/video/(\d+)", url)
|
||||||
if m:
|
if m:
|
||||||
vid_id = m.group(1)
|
vid_id = m.group(1)
|
||||||
|
|
||||||
if not vid_id:
|
if not vid_id:
|
||||||
|
logging.debug(f"⏭️ Skipping entry with no ID: {entry}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
videos.append({
|
videos.append({
|
||||||
@@ -836,11 +852,12 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
"timestamp": arrow.utcnow().isoformat(),
|
"timestamp": arrow.utcnow().isoformat(),
|
||||||
"video_url": url,
|
"video_url": url,
|
||||||
})
|
})
|
||||||
|
logging.debug(f" 📹 {vid_id}: {desc[:60]}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"⚠️ yt-dlp entry parse error: {e}")
|
logging.warning(f"⚠️ yt-dlp entry parse error: {e}")
|
||||||
|
|
||||||
logging.info(f"✅ yt-dlp fallback produced {len(videos)} videos.")
|
logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"❌ yt-dlp profile scrape failed: {e}")
|
logging.error(f"❌ yt-dlp profile scrape failed: {e}")
|
||||||
@@ -850,7 +867,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
|
|||||||
os.unlink(cookie_file)
|
os.unlink(cookie_file)
|
||||||
|
|
||||||
return videos
|
return videos
|
||||||
|
|
||||||
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
|
def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
|
||||||
"""
|
"""
|
||||||
Extract both the numeric user ID and secUid from the profile page HTML.
|
Extract both the numeric user ID and secUid from the profile page HTML.
|
||||||
|
|||||||
Reference in New Issue
Block a user