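Add Playwright install-deps step to CI, always upgrade yt-dlp, and improve yt-dlp fallback URL/ID handling and logging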

2026-05-19 12:06:14 +02:00
parent 8c13f0c355
commit bb6f0e0139
2 changed files with 57 additions and 23 deletions
--- a/jenkins/jijantesFCTikTok
+++ b/jenkins/jijantesFCTikTok
@@ -37,19 +37,25 @@ pipeline {
                sh '''
                    set -euxo pipefail

-                    # Create venv
+                    # ── Playwright system dependencies (required in CI) ─
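+                    # Installs libglib, libnss, libatk, libdrm, etc. Safe to run even if already installed — exits 0.
+                    # NOTE: this runs before the venv is created below; without an existing venv it falls through to the fallbacks.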
+                    "${VENV_DIR}/bin/python" -m playwright install-deps chromium || \
+                        sudo playwright install-deps chromium || \
+                        echo "⚠️ playwright install-deps skipped (no sudo) — continuing"
+
+                    # ── Create venv ────────────────────────────────────
                    python3 -m venv "${VENV_DIR}"

-                    # Upgrade pip toolchain
+                    # ── Upgrade pip toolchain ──────────────────────────
                    "${VENV_DIR}/bin/python" -m pip install --upgrade pip wheel setuptools

-                    # Install all required packages
+                    # ── Install all required packages ──────────────────
                    "${VENV_DIR}/bin/pip" install \
                        --cache-dir "${PIP_CACHE_DIR}" \
                        -U \
                        atproto \
                        playwright \
-                        yt-dlp \
                        httpx \
                        arrow \
                        python-dotenv \
@@ -59,8 +65,20 @@ pipeline {
                        Pillow \
                        grapheme

-                    # ── Install playwright-stealth and detect version ──
-                    "${VENV_DIR}/bin/pip" install --cache-dir "${PIP_CACHE_DIR}" -U playwright-stealth
+                    # ── yt-dlp: always upgrade to latest ──────────────
+                    # TikTok extractor breaks frequently — latest is required
+                    "${VENV_DIR}/bin/pip" install \
+                        --cache-dir "${PIP_CACHE_DIR}" \
+                        --upgrade \
+                        "yt-dlp"
+
+                    # Print installed yt-dlp version for traceability
+                    "${VENV_DIR}/bin/pip" show yt-dlp | grep -E "^(Name|Version)"
+
+                    # ── playwright-stealth ─────────────────────────────
+                    "${VENV_DIR}/bin/pip" install \
+                        --cache-dir "${PIP_CACHE_DIR}" \
+                        -U playwright-stealth

                    # Print which version was installed for traceability
                    "${VENV_DIR}/bin/pip" show playwright-stealth | grep -E "^(Name|Version)"
@@ -87,7 +105,7 @@ except ImportError:
                    ffmpeg  -version | head -1
                    ffprobe -version | head -1

-                    # ── Playwright browser binaries (no sudo needed) ───
+                    # ── Playwright browser binaries ────────────────────
                    "${VENV_DIR}/bin/python" -m playwright install chromium
                '''
            }
@@ -141,7 +159,7 @@ except ImportError:
    post {

        always {
-            // Archive logs, state, and any CAPTCHA/debug screenshots
+            // Archive logs, state, and any debug screenshots
            archiveArtifacts(
                artifacts:         '*.log, *.json, screenshot_*.png',
                allowEmptyArchive: true
--- a/tiktok2bsky.py
+++ b/tiktok2bsky.py
@@ -774,11 +774,11 @@ def _try_refresh_grid(page, max_attempts: int = 4) -> bool:

 def _scrape_via_api(handle: str, cookies: list) -> list:
    """
-    Fallback scraper using yt-dlp to extract the video list from a
-    TikTok profile. yt-dlp handles TikTok's signing tokens internally.
+    Fallback scraper using yt-dlp to list videos from a TikTok profile.
+    yt-dlp handles TikTok's request signing internally — no raw API needed.
    Returns same list-of-dicts format as the Playwright scraper.
    """
-    logging.info(f"📦 Trying yt-dlp profile scrape fallback for @{handle}...")
+    logging.info(f"📦 yt-dlp profile scrape fallback for @{handle}...")

    cookie_file = None
    videos      = []
@@ -791,7 +791,7 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
        ydl_opts = {
            "quiet":        True,
            "no_warnings":  False,
-            "extract_flat":   True,       # metadata only — no download
+            "extract_flat": True,   # metadata only — no video download yet
            "playlistend":  SCRAPE_VIDEO_LIMIT,
            "ignoreerrors": True,
        }
@@ -799,6 +799,7 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
            ydl_opts["cookiefile"] = cookie_file

        profile_url = f"https://www.tiktok.com/@{handle}"
+        logging.info(f"🌐 yt-dlp extracting: {profile_url}")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(profile_url, download=False)
@@ -808,7 +809,10 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
            return []

        entries = info.get("entries") or []
-        logging.info(f"✅ yt-dlp profile scrape returned {len(entries)} entries.")
+        logging.info(
+            f"✅ yt-dlp returned {len(entries)} entries "
+            f"(playlist: {info.get('title', '?')})"
+        )

        for entry in entries[:SCRAPE_VIDEO_LIMIT]:
            try:
@@ -816,17 +820,29 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
                    continue

                vid_id = str(entry.get("id") or "")
-                url     = entry.get("url") or entry.get("webpage_url") or ""
-                desc    = entry.get("title") or entry.get("description") or ""
+                url    = (
+                    entry.get("webpage_url")
+                    or entry.get("url")
+                    or ""
+                )
+                desc   = (
+                    entry.get("title")
+                    or entry.get("description")
+                    or ""
+                )

                # Normalise URL
                if vid_id and not url:
                    url = f"https://www.tiktok.com/@{handle}/video/{vid_id}"
-                if not vid_id:
+
+                # Extract ID from URL if missing
+                if not vid_id and url:
                    m = re.search(r"/video/(\d+)", url)
                    if m:
                        vid_id = m.group(1)
+
                if not vid_id:
+                    logging.debug(f"⏭️ Skipping entry with no ID: {entry}")
                    continue

                videos.append({
@@ -836,11 +852,12 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
                    "timestamp": arrow.utcnow().isoformat(),
                    "video_url": url,
                })
+                logging.debug(f"  📹 {vid_id}: {desc[:60]}")

            except Exception as e:
                logging.warning(f"⚠️ yt-dlp entry parse error: {e}")

-        logging.info(f"✅ yt-dlp fallback produced {len(videos)} videos.")
+        logging.info(f"✅ yt-dlp fallback produced {len(videos)} usable videos.")

    except Exception as e:
        logging.error(f"❌ yt-dlp profile scrape failed: {e}")
@@ -850,7 +867,6 @@ def _scrape_via_api(handle: str, cookies: list) -> list:
            os.unlink(cookie_file)

    return videos
-
 def _resolve_tiktok_ids(handle: str, headers: dict) -> tuple[str | None, str | None]:
    """
    Extract both the numeric user ID and secUid from the profile page HTML.