diff --git a/.github/workflows/skin-designer-devices-update.yml b/.github/workflows/skin-designer-devices-update.yml index 6c79c30aaa..d5e3f28a04 100644 --- a/.github/workflows/skin-designer-devices-update.yml +++ b/.github/workflows/skin-designer-devices-update.yml @@ -1,22 +1,13 @@ name: Skin Designer Device DB Update -# Runs only on master so it doesn't fire on every PR that touches the script. -# Each run pulls a small batch of the latest GSMArena entries and merges them -# into the bundled devices.json. Slow-and-steady avoids the 30+ minute scrapes -# that risk IP bans and CI cancellation. +# Walks GSMArena's curated brand listings once a week and merges any new +# phones/tablets into the bundled devices.json. The HTML cache is persisted +# across runs so re-fetches are limited to pages that actually changed. on: schedule: - # Every 6 hours. Each run only scrapes the "latest mobiles" page, so we - # accumulate fresh devices without ever doing a multi-thousand-page crawl. - - cron: '0 */6 * * *' + # Mondays at 03:00 UTC. Weekly is plenty — phones don't release that fast. + - cron: '0 3 * * 1' workflow_dispatch: - inputs: - mode: - description: "trickle (~50 latest) or full (all curated brands)" - required: false - default: "trickle" - type: choice - options: [trickle, full] permissions: actions: write @@ -28,7 +19,9 @@ jobs: # Only run on the canonical repo's master, never on forks or feature branches. if: github.repository == 'codenameone/CodenameOne' && github.ref == 'refs/heads/master' runs-on: ubuntu-latest - timeout-minutes: 25 + # Cold runs (empty cache) can take 30–60 min to walk the curated brands; + # warm runs are typically a few minutes. + timeout-minutes: 75 steps: - name: Check out repository @@ -47,17 +40,10 @@ jobs: restore-keys: | skin-designer-devicedb-cache- - - name: Trickle scrape (~50 latest) - if: github.event_name == 'schedule' || github.event.inputs.mode == 'trickle' || github.event.inputs.mode == '' + - name: Scrape curated brands run: | python3 scripts/skindesigner/tools/devicedb/build_devices_json.py \ - --mode latest --limit 50 --delay 2.0 --max-pages 1 - - - name: Full scrape (all curated brands) - if: github.event_name == 'workflow_dispatch' && github.event.inputs.mode == 'full' - run: | - python3 scripts/skindesigner/tools/devicedb/build_devices_json.py \ - --mode brands --delay 2.0 --max-pages 12 + --delay 2.0 --max-pages 12 - name: Bail out if nothing changed id: diff @@ -79,8 +65,8 @@ jobs: body: | ## Summary - Regenerates `scripts/skindesigner/common/src/main/resources/devices.json` - by scraping a small batch of the latest devices from GSMArena and - merging into the existing catalog. + by walking the curated GSMArena brand pages and merging new + entries into the existing catalog. Created automatically by the Skin Designer device DB workflow. base: master diff --git a/scripts/skindesigner/tools/devicedb/README.md b/scripts/skindesigner/tools/devicedb/README.md index d38bb46620..05637c0a36 100644 --- a/scripts/skindesigner/tools/devicedb/README.md +++ b/scripts/skindesigner/tools/devicedb/README.md @@ -82,11 +82,11 @@ curated list misses a brand someone needs): ## Refreshing in CI -`.github/workflows/skin-designer-devices-update.yml` runs the scraper on the -1st of each month and opens an automated PR if the JSON drifted. The HTML -cache is persisted across runs via `actions/cache` so the scrape only -re-fetches phones whose pages changed. Run it manually via the Actions tab -when you want a fresh dump. +`.github/workflows/skin-designer-devices-update.yml` runs the scraper every +Monday at 03:00 UTC and opens an automated PR only when device records +actually change. The HTML cache is persisted across runs via `actions/cache` +so scrapes after the first cold run only re-fetch phones whose pages +changed. Trigger it manually via the Actions tab when you want a fresh dump. ## Notes / caveats diff --git a/scripts/skindesigner/tools/devicedb/build_devices_json.py b/scripts/skindesigner/tools/devicedb/build_devices_json.py index cc225dd515..fa28fa7329 100755 --- a/scripts/skindesigner/tools/devicedb/build_devices_json.py +++ b/scripts/skindesigner/tools/devicedb/build_devices_json.py @@ -331,6 +331,20 @@ def merge(existing: list[dict], fresh: list[dict]) -> list[dict]: return list(by_id.values()) +def _devices_changed(existing: list[dict], merged: list[dict]) -> bool: + """True iff the device records (ignoring order) actually differ.""" + if len(existing) != len(merged): + return True + old_by_id = {r["id"]: r for r in existing} + new_by_id = {r["id"]: r for r in merged} + if old_by_id.keys() != new_by_id.keys(): + return True + for k, old in old_by_id.items(): + if old != new_by_id[k]: + return True + return False + + def load_existing(path: str) -> list[dict]: if not os.path.exists(path): return [] @@ -342,62 +356,15 @@ def load_existing(path: str) -> list[dict]: return [] -RE_LATEST_LINK = re.compile( - r']*>\s*]*alt="([^"]+)"[^>]*>\s*', - re.I, -) - - -def walk_latest(*, max_pages: int = 1, delay: float = 1.0) -> Iterable[tuple[str, str, str]]: - """Yields (phone_url, brand, model) from latest-mobiles.php3. - - The "latest" page lists newly-added devices across every brand. It's the - cheapest source of fresh data — one or two listing pages cover the last - few weeks of releases. - """ - seen_pages: set[str] = set() - queue: list[str] = ["latest-mobiles.php3"] - pages_fetched = 0 - while queue and pages_fetched < max_pages: - page = queue.pop(0) - if page in seen_pages: - continue - seen_pages.add(page) - pages_fetched += 1 - try: - html = http_get(BASE + page, delay=delay) - except RuntimeError as err: - sys.stderr.write(f" ! latest page {page} failed: {err}\n") - continue - for m in RE_LATEST_LINK.finditer(html): - phone_url = m.group(1) - if phone_url.endswith("-review.php"): - continue - alt = unescape(m.group(2)).strip() - # alt is " "; split on first space to recover. - parts = alt.split(" ", 1) - brand = parts[0] if parts else "Unknown" - model = parts[1] if len(parts) > 1 else alt - yield phone_url, brand, model - # Pagination: latest-mobiles-pN.php - for m in RE_NAV_PAGE.finditer(html): - np = m.group(1) - if np.startswith("latest-mobiles") and np not in seen_pages: - queue.append(np) - - def main() -> int: ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) - ap.add_argument("--mode", choices=("brands", "latest"), default="brands", - help="brands = walk per-brand listings (large/slow); " - "latest = scrape recent additions only (fast/trickle)") ap.add_argument("--brands", default=",".join(DEFAULT_BRANDS), help="Comma-separated brand slugs (no .php). Use --full to walk all brands instead.") ap.add_argument("--full", action="store_true", - help="Discover and walk every brand on makers.php3 (only with --mode brands)") + help="Discover and walk every brand on makers.php3") ap.add_argument("--max-pages", type=int, default=10, - help="Cap on listing pages walked per brand (or per latest run)") + help="Cap on listing pages walked per brand") ap.add_argument("--min-year", type=int, default=2014, help="Drop devices announced before this year") ap.add_argument("--delay", type=float, default=1.0, @@ -415,11 +382,25 @@ def main() -> int: seen: set[str] = set() total_phones = 0 - if args.mode == "latest": - sys.stderr.write(f"Trickle-scraping latest-mobiles " - f"(max-pages={args.max_pages}, limit={args.limit or 'none'}, " - f"delay={args.delay}s)…\n") - for url, brand, model in walk_latest(max_pages=args.max_pages, delay=args.delay): + if args.full: + sys.stderr.write("Discovering all brands…\n") + all_brands = discover_brands() + target = [(s, l) for s, l in all_brands if s.endswith(tuple(f"-{i}" for i in range(1000)))] + if not target: + target = all_brands + else: + slugs = [b.strip() for b in args.brands.split(",") if b.strip()] + try: + label_map = dict(discover_brands()) + except Exception: + label_map = {} + target = [(s, brand_label_from_slug(s, label_map)) for s in slugs] + + sys.stderr.write(f"Scraping {len(target)} brand(s) at {args.delay}s/request…\n") + + for slug, label in target: + sys.stderr.write(f"\n=== {label} [{slug}] ===\n") + for url, model in walk_brand(slug, max_pages=args.max_pages, delay=args.delay): if url in seen: continue seen.add(url) @@ -429,60 +410,33 @@ def main() -> int: sys.stderr.write(f" ! skip {url}: {err}\n") continue specs = parse_phone_page(html) - rec = normalise(brand, model, specs) + rec = normalise(label, model, specs) if rec is not None and rec["year"] >= args.min_year: fresh.append(rec) total_phones += 1 if args.limit and total_phones >= args.limit: sys.stderr.write(f" · hit --limit {args.limit}, stopping\n") break - else: - if args.full: - sys.stderr.write("Discovering all brands…\n") - all_brands = discover_brands() - target = [(s, l) for s, l in all_brands if s.endswith(tuple(f"-{i}" for i in range(1000)))] - if not target: - target = all_brands - else: - slugs = [b.strip() for b in args.brands.split(",") if b.strip()] - try: - label_map = dict(discover_brands()) - except Exception: - label_map = {} - target = [(s, brand_label_from_slug(s, label_map)) for s in slugs] - - sys.stderr.write(f"Scraping {len(target)} brand(s) at {args.delay}s/request…\n") - - for slug, label in target: - sys.stderr.write(f"\n=== {label} [{slug}] ===\n") - for url, model in walk_brand(slug, max_pages=args.max_pages, delay=args.delay): - if url in seen: - continue - seen.add(url) - try: - html = http_get(BASE + url, delay=args.delay, use_cache=use_cache) - except RuntimeError as err: - sys.stderr.write(f" ! skip {url}: {err}\n") - continue - specs = parse_phone_page(html) - rec = normalise(label, model, specs) - if rec is not None and rec["year"] >= args.min_year: - fresh.append(rec) - total_phones += 1 - if args.limit and total_phones >= args.limit: - sys.stderr.write(f" · hit --limit {args.limit}, stopping\n") - break - if args.limit and total_phones >= args.limit: - break + if args.limit and total_phones >= args.limit: + break merged = merge(existing, fresh) merged.sort(key=lambda r: (-(r["year"] or 0), r["brand"].lower(), r["name"].lower())) + # Don't rewrite the file when no device record actually changed. The + # envelope (version/generator/source) shifting is not a reason to spam + # PRs; only real device-data changes warrant a commit. + if not _devices_changed(existing, merged): + sys.stderr.write( + f"\nNo device records changed ({len(fresh)} scraped, " + f"{len(merged)} total); leaving {args.out} untouched.\n" + ) + return 0 + payload = { "version": 2, "generator": "build_devices_json.py", "source": BASE, - "fresh_count": len(fresh), "count": len(merged), "devices": merged, }