Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions src/batch/services/batch-item-filename.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
const MAX_SLUG_LENGTH = 100;

function slugify(value: string): string {
return value
.replace(/[^a-zA-Z0-9.-]+/g, "-")
.replace(/^-+|-+$/g, "")
.slice(0, MAX_SLUG_LENGTH);
}

/**
* Derive a filesystem-safe base name (no extension) for a batch item. URLs are
* slugged from host + path; anything else falls back to the row index.
*/
export function batchItemFilename(input: string, index: number): string {
try {
const url = new URL(input);
const slug = slugify(`${url.hostname}${url.pathname}`);
if (slug.length > 0) {
return slug;
}
} catch {
// Not a URL — fall back to the row index below.
}

return `item-${index}`;
}
20 changes: 20 additions & 0 deletions src/batch/services/batch-record.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import type { BatchResult } from "../types/batch-result.js";

export interface BatchRecord {
error?: { class: string; message: string };
index: number;
input: string;
result?: unknown;
}

/**
* Shape a settled batch result into the record that is streamed to stdout or
* written per item. Successes carry `result`; failures carry `error`.
*/
export function toBatchRecord(result: BatchResult): BatchRecord {
if (result.ok) {
return { index: result.index, input: result.input, result: result.data };
}

return { index: result.index, input: result.input, error: result.error };
}
53 changes: 53 additions & 0 deletions src/batch/services/directory-sink.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import { mkdirSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import type { BatchResult } from "../types/batch-result.js";
import type { BatchSink } from "../types/batch-sink.js";
import { batchItemFilename } from "./batch-item-filename.js";
import { toBatchRecord } from "./batch-record.js";

export interface DirectorySinkOptions {
pretty?: boolean;
}

function uniqueName(base: string, used: Set<string>): string {
if (!used.has(base)) {
used.add(base);
return base;
}

let suffix = 2;
let candidate = `${base}-${suffix}`;
while (used.has(candidate)) {
suffix++;
candidate = `${base}-${suffix}`;
}
used.add(candidate);
return candidate;
}

/**
* Per-item sink: writes one `<name>.json` file per result into `dir` (created
* if needed). File names come from the input URL slug or row index, deduped on
* collision. Each file holds the same record that the stdout sink would emit.
*/
export function createDirectorySink(
dir: string,
options: DirectorySinkOptions = {}
): BatchSink {
mkdirSync(dir, { recursive: true });
const used = new Set<string>();
const indent = options.pretty ? 2 : undefined;

return {
write(result: BatchResult): void {
const base = batchItemFilename(result.input, result.index);
const name = uniqueName(base, used);
const record = toBatchRecord(result);
writeFileSync(
join(dir, `${name}.json`),
JSON.stringify(record, null, indent),
"utf8"
);
},
};
}
14 changes: 14 additions & 0 deletions src/batch/services/ndjson-stdout-sink.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import type { BatchResult } from "../types/batch-result.js";
import type { BatchSink } from "../types/batch-sink.js";
import { toBatchRecord } from "./batch-record.js";

/**
* Batch default sink: one JSON record per line on stdout, regardless of TTY.
*/
export function createNdjsonStdoutSink(): BatchSink {
return {
write(result: BatchResult): void {
process.stdout.write(`${JSON.stringify(toBatchRecord(result))}\n`);
},
};
}
7 changes: 7 additions & 0 deletions src/batch/types/batch-sink.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import type { BatchResult } from "./batch-result.js";

/** Destination for batch results — stdout (ndjson) or a directory of files. */
export interface BatchSink {
close?(): void | Promise<void>;
write(result: BatchResult): void | Promise<void>;
}
19 changes: 19 additions & 0 deletions tests/batch/batch-item-filename.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import { describe, expect, it } from "vitest";
import { batchItemFilename } from "../../src/batch/services/batch-item-filename.js";

describe("batchItemFilename", () => {
it("slugs a URL from host and path, dropping the query", () => {
expect(batchItemFilename("https://example.com/a/b?x=1", 0)).toBe(
"example.com-a-b"
);
});

it("falls back to the row index for non-URL input", () => {
expect(batchItemFilename("how to scrape", 3)).toBe("item-3");
});

it("truncates very long slugs", () => {
const long = `https://example.com/${"a".repeat(300)}`;
expect(batchItemFilename(long, 0).length).toBeLessThanOrEqual(100);
});
});
61 changes: 61 additions & 0 deletions tests/batch/directory-sink.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { mkdtempSync, readdirSync, readFileSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { createDirectorySink } from "../../src/batch/services/directory-sink.js";

let dir: string;

beforeEach(() => {
dir = mkdtempSync(join(tmpdir(), "batch-dir-sink-"));
});

afterEach(() => {
rmSync(dir, { recursive: true, force: true });
});

describe("createDirectorySink", () => {
it("writes one record file per item, named from the input", () => {
const outDir = join(dir, "out");
const sink = createDirectorySink(outDir);

sink.write({ index: 0, input: "https://a.com/x", ok: true, data: "A" });
sink.write({
index: 1,
input: "https://b.com",
ok: false,
error: { class: "TimeoutError", message: "timed out" },
});

const files = readdirSync(outDir).sort();
expect(files).toEqual(["a.com-x.json", "b.com.json"]);

const success = JSON.parse(
readFileSync(join(outDir, "a.com-x.json"), "utf8")
);
expect(success).toEqual({
index: 0,
input: "https://a.com/x",
result: "A",
});

const failure = JSON.parse(
readFileSync(join(outDir, "b.com.json"), "utf8")
);
expect(failure).toEqual({
index: 1,
input: "https://b.com",
error: { class: "TimeoutError", message: "timed out" },
});
});

it("dedupes colliding names with a numeric suffix", () => {
const sink = createDirectorySink(dir);

sink.write({ index: 0, input: "not a url", ok: true, data: 1 });
sink.write({ index: 0, input: "also not a url", ok: true, data: 2 });

const files = readdirSync(dir).sort();
expect(files).toEqual(["item-0-2.json", "item-0.json"]);
});
});
38 changes: 38 additions & 0 deletions tests/batch/ndjson-stdout-sink.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { createNdjsonStdoutSink } from "../../src/batch/services/ndjson-stdout-sink.js";

afterEach(() => {
vi.restoreAllMocks();
});

describe("createNdjsonStdoutSink", () => {
it("writes one JSON line per result to stdout", () => {
const lines: string[] = [];
vi.spyOn(process.stdout, "write").mockImplementation((chunk: unknown) => {
lines.push(String(chunk));
return true;
});

const sink = createNdjsonStdoutSink();
sink.write({ index: 0, input: "https://a.com", ok: true, data: { ok: 1 } });
sink.write({
index: 1,
input: "https://b.com",
ok: false,
error: { class: "ValidationError", message: "nope" },
});

expect(lines).toHaveLength(2);
expect(lines[0].endsWith("\n")).toBe(true);
expect(JSON.parse(lines[0])).toEqual({
index: 0,
input: "https://a.com",
result: { ok: 1 },
});
expect(JSON.parse(lines[1])).toEqual({
index: 1,
input: "https://b.com",
error: { class: "ValidationError", message: "nope" },
});
});
});
Loading