a81a450e7e
Merged into tinqs/studio: - cmd/tinqs-cli/ — tinqs-cli (Go binary, from bot/cli) - cmd/tea/ — Gitea CLI tool (from tinqs/cli-tea) - services/bot/ — Bot service (from tinqs-ltd/bot on git.arikigame.com) - services/admin/ — Admin panel (from tinqs/admin) - services/team-tool/ — Team Tool (from tinqs/team-tool) - services/proxy/ — tinqs-proxy (from bot/proxy) - web/landing/ — tinqs.com website (from tinqs/website) - web/docs/ — Platform docs (from tinqs/docs) - web/blog/ — Blog (placeholder) - runner/ — Ephemeral CI runner (from tinqs/runner) All source repos will be deleted after verification.
354 lines
13 KiB
TypeScript
354 lines
13 KiB
TypeScript
/**
 * Vision pre-processing for OpenAI-compat inference requests.
 *
 * DeepSeek's chat models are text-only. To let Cursor users send screenshots
 * through tinqsProxy, we walk each request's `messages[].content` array, find
 * `image_url` parts, run them through Amazon Nova Pro (the same vision model
 * `/api/v1/hub/describe` uses), and replace each image part with a
 * `[image: <description>]` text part before forwarding to DeepSeek.
 *
 * Descriptions are cached in Redis by sha256(image bytes) — same image across
 * turns hits cache, no second Nova call. 30-day TTL matches the reasoning
 * cache.
 *
 * Failure modes never block the request:
 *   - Bytes too big / wrong mime → replace with "[image: unavailable — <reason>]"
 *   - Nova call fails → same treatment, logged
 *   - AWS creds missing → same treatment, logged once per request
 *
 * Anthropic-compat path is not touched — DeepSeek's `/anthropic/messages`
 * doesn't accept images today and we'd want to test that surface separately.
 */
|
||
import crypto from "node:crypto";
|
||
import {
|
||
BedrockRuntimeClient,
|
||
InvokeModelCommand,
|
||
} from "@aws-sdk/client-bedrock-runtime";
|
||
import { redis } from "./db";
|
||
import { estimateCost } from "./inference";
|
||
|
||
/** Bedrock model id of the vision model used to describe images. */
const NOVA_MODEL = "eu.amazon.nova-pro-v1:0";

/** AWS region for the Bedrock client; the `AWS_REGION` env var overrides the default. */
const REGION = process.env.AWS_REGION ?? "eu-west-1";

/** Lifetime of a cached description in Redis: 30 days, expressed in seconds. */
const CACHE_TTL_SECONDS = 30 * 24 * 60 * 60;

/** Largest image accepted, in bytes (10 MB). */
const MAX_IMAGE_BYTES = 10 * 1024 * 1024;

/** Upper bound on the number of images processed in parallel per request. */
const MAX_CONCURRENCY = 4;

/** Remote image downloads are aborted after this many milliseconds. */
const FETCH_TIMEOUT_MS = 8 * 1000;
|
||
|
||
/**
 * System prompt sent to the vision model with every image. One entry per
 * prompt line; the entries are joined with newlines.
 */
const SYSTEM_PROMPT = [
  `You produce factual image descriptions for a downstream text-only LLM.`,
  ``,
  `RULES:`,
  `- Plain English. No HTML.`,
  `- Use relative-position descriptors (top-left, center, right side, bottom half, upper-right corner, left sidebar, bottom toolbar, etc.) to locate elements spatially. Do NOT use pixel coordinates.`,
  `- Preserve readable text VERBATIM in quotes — code, error messages, URLs, file names, UI labels, numbers.`,
  `- 3–8 sentences for simple screens, up to 15 for dense ones.`,
  `- Describe what the image SHOWS, not what the user should do.`,
  `- If the image is a screenshot of code, include language, file name (if visible), and the visible code structure (functions, classes, variables).`,
  `- If the image is a UI, list visible interactive elements, their labels, and their approximate location (e.g. "top-left corner: hamburger menu icon", "center: login form with email and password fields", "bottom bar: Home / Search / Profile tabs").`,
  `- If the image is a photo or diagram, describe subjects, layout, color palette, and any captions.`,
  `- End with one sentence summarizing the single most important fact a downstream LLM needs to answer questions about this image.`,
].join("\n");

/** User-turn text sent alongside the image when the caller supplies no prompt hint. */
const DEFAULT_USER_PROMPT = "Describe this image for a downstream text-only LLM.";
|
||
|
||
// ──────────────────────────────────────────────────────────────────────────
|
||
// Types
|
||
// ──────────────────────────────────────────────────────────────────────────
|
||
|
||
/** A plain-text entry of a message's `content` array. */
type TextPart = {
  type: "text";
  text: string;
};

/** An image entry of a message's `content` array; `url` is a data URL or an http(s) URL. */
type ImageUrlPart = {
  type: "image_url";
  image_url: {
    url: string;
    detail?: string;
  };
};

/** Any entry of a `content` array; part kinds this module does not know are kept as open records. */
type ContentPart =
  | TextPart
  | ImageUrlPart
  | { type: string; [k: string]: unknown };

/** Loose shape of one chat message: only `role` and `content` are inspected, other keys pass through. */
interface MessageLike {
  role?: string;
  content?: string | ContentPart[];
  [k: string]: unknown;
}
|
||
|
||
/** Counters reported by one `preprocessImages` run. */
export interface VisionPreprocessStats {
  /** Number of `image_url` parts found in the request. */
  images: number;
  /** Images whose description was served from the Redis cache. */
  cache_hits: number;
  /** Images replaced with an "unavailable" placeholder. */
  failures: number;
  /** Input tokens summed over the live vision-model calls. */
  tokens_in: number;
  /** Output tokens summed over the live vision-model calls. */
  tokens_out: number;
  /** Sum of `estimateCost(...)` over the live vision-model calls (presumably US dollars, per the name). */
  cost_usd: number;
  /** Wall-clock duration of the run, in milliseconds. */
  latency_ms: number;
}
|
||
|
||
/** Build a fresh stats record with every counter at zero; each call returns a new object. */
export function emptyStats(): VisionPreprocessStats {
  const zeroed: VisionPreprocessStats = {
    images: 0,
    cache_hits: 0,
    failures: 0,
    tokens_in: 0,
    tokens_out: 0,
    cost_usd: 0,
    latency_ms: 0,
  };
  return zeroed;
}
|
||
|
||
// ──────────────────────────────────────────────────────────────────────────
|
||
// Pure helpers (testable without Redis or Bedrock)
|
||
// ──────────────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Report whether at least one message carries an `image_url` content part.
 *
 * Anything that is not an array of messages, and any message whose `content`
 * is not an array, is treated as having no images.
 */
export function hasImageParts(messages: unknown): boolean {
  if (!Array.isArray(messages)) return false;

  const isImagePart = (part: unknown): boolean =>
    Boolean(part) && typeof part === "object" && (part as ContentPart).type === "image_url";

  return messages.some((message: unknown) => {
    const content = (message as MessageLike | null | undefined)?.content;
    return Array.isArray(content) && content.some(isImagePart);
  });
}
|
||
|
||
/**
 * Decode an `image_url.url` into raw bytes + mime. Supports:
 *   - `data:image/<mime>;base64,<...>`
 *   - `http://...` and `https://...` (fetched; the download is abandoned as
 *     soon as it exceeds MAX_IMAGE_BYTES or FETCH_TIMEOUT_MS elapses)
 *
 * Never throws: every failure is reported as `{ error: <reason> }`.
 *
 * @param url The value of an `image_url.url` field.
 * @returns `{ bytes, mime }` on success, `{ error }` with a human-readable
 *   reason otherwise.
 */
export async function resolveImageBytes(
  url: string,
): Promise<{ bytes: Buffer; mime: string } | { error: string }> {
  if (typeof url !== "string" || url.length === 0) {
    return { error: "empty url" };
  }

  if (url.startsWith("data:")) {
    const match = /^data:(image\/[a-zA-Z0-9.+-]+);base64,(.*)$/s.exec(url);
    if (!match) return { error: "malformed data url" };
    const [, mime, rawPayload] = match;

    // Buffer.from(..., "base64") never throws on bad input, so the payload is
    // validated explicitly. Whitespace is tolerated, and both the standard and
    // the URL-safe alphabets are accepted because the decoder handles both.
    const payload = rawPayload.replace(/\s+/g, "");
    if (!/^[A-Za-z0-9+\/_-]*={0,2}$/.test(payload)) {
      return { error: "invalid base64" };
    }

    const bytes = Buffer.from(payload, "base64");
    if (bytes.byteLength === 0) return { error: "empty image" };
    if (bytes.byteLength > MAX_IMAGE_BYTES) {
      return { error: `image too large (${bytes.byteLength} bytes, max ${MAX_IMAGE_BYTES})` };
    }
    return { bytes, mime };
  }

  if (url.startsWith("https://") || url.startsWith("http://")) {
    // NOTE(review): `url` comes from the request body and is fetched from this
    // server without any host filtering (redirects are followed too). Consider
    // rejecting private / link-local destinations to avoid SSRF.
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
    try {
      const res = await fetch(url, { signal: controller.signal });

      // Release the connection when the body is not going to be read.
      const reject = (error: string): { error: string } => {
        void res.body?.cancel().catch(() => undefined);
        return { error };
      };

      if (!res.ok) return reject(`fetch ${res.status}`);

      const mime = (res.headers.get("content-type") || "").split(";")[0].trim();
      if (!mime.startsWith("image/")) {
        return reject(`non-image content-type: ${mime || "unknown"}`);
      }

      // Cheap early exit when the server declares an oversized body.
      const declared = Number(res.headers.get("content-length"));
      if (Number.isFinite(declared) && declared > MAX_IMAGE_BYTES) {
        return reject(`image too large (${declared} bytes, max ${MAX_IMAGE_BYTES})`);
      }

      // Stream the body so that a missing or wrong Content-Length cannot make
      // us buffer more than MAX_IMAGE_BYTES.
      const chunks: Uint8Array[] = [];
      let total = 0;
      const reader = res.body?.getReader();
      if (reader) {
        for (;;) {
          const { done, value } = await reader.read();
          if (done) break;
          if (!value) continue;
          total += value.byteLength;
          if (total > MAX_IMAGE_BYTES) {
            await reader.cancel().catch(() => undefined);
            return { error: `image too large (more than ${MAX_IMAGE_BYTES} bytes)` };
          }
          chunks.push(value);
        }
      }

      if (total === 0) return { error: "empty image" };
      return { bytes: Buffer.concat(chunks, total), mime };
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      return { error: `fetch failed: ${reason}` };
    } finally {
      clearTimeout(timer);
    }
  }

  return { error: "unsupported url scheme" };
}
|
||
|
||
/**
 * Translate an image MIME type (case-insensitive) into the `format` value used
 * in the Nova request, or `null` when there is no matching format.
 */
export function novaFormatForMime(mime: string): "png" | "jpeg" | "gif" | "webp" | null {
  switch (mime.toLowerCase()) {
    case "image/png":
      return "png";
    case "image/jpeg":
    case "image/jpg":
      return "jpeg";
    case "image/gif":
      return "gif";
    case "image/webp":
      return "webp";
    default:
      return null;
  }
}
|
||
|
||
/** Hex-encoded SHA-256 digest of `bytes`. */
export function sha256Hex(bytes: Buffer): string {
  const hasher = crypto.createHash("sha256");
  hasher.update(bytes);
  return hasher.digest("hex");
}
|
||
|
||
/** Redis key under which the description for the image with this content hash is stored. */
function cacheKey(hash: string): string {
  return "inference:vision:" + hash;
}
|
||
|
||
// ──────────────────────────────────────────────────────────────────────────
|
||
// Bedrock call
|
||
// ──────────────────────────────────────────────────────────────────────────
|
||
|
||
/** Module-wide Bedrock client, created on first use. */
let bedrockClient: BedrockRuntimeClient | null = null;

/** Return the shared Bedrock client, constructing it for `REGION` on the first call. */
function getBedrock(): BedrockRuntimeClient {
  if (bedrockClient === null) {
    bedrockClient = new BedrockRuntimeClient({ region: REGION });
  }
  return bedrockClient;
}
|
||
|
||
/**
 * Ask the vision model (`NOVA_MODEL`) to describe one image.
 *
 * @param bytes Raw image bytes; sent base64-encoded in the request body.
 * @param format Image format label placed in the request.
 * @param userPrompt Text sent in the user turn next to the image.
 * @returns The first text part of the reply (or "(no description)" when it is
 *   missing) and the token usage reported by the model (0 when absent).
 *   Errors from the Bedrock call or from parsing the reply propagate.
 */
async function describeWithNova(
  bytes: Buffer,
  format: "png" | "jpeg" | "gif" | "webp",
  userPrompt: string,
): Promise<{ text: string; tokensIn: number; tokensOut: number }> {
  const imagePart = { image: { format, source: { bytes: bytes.toString("base64") } } };
  const request = {
    system: [{ text: SYSTEM_PROMPT }],
    messages: [{ role: "user", content: [imagePart, { text: userPrompt }] }],
    inferenceConfig: { maxTokens: 1500, temperature: 0.2 },
  };

  const command = new InvokeModelCommand({
    modelId: NOVA_MODEL,
    contentType: "application/json",
    accept: "application/json",
    body: new TextEncoder().encode(JSON.stringify(request)),
  });
  const response = await getBedrock().send(command);

  const reply = JSON.parse(new TextDecoder().decode(response.body));
  const usage = reply?.usage;
  const text: string = reply?.output?.message?.content?.[0]?.text || "(no description)";
  const tokensIn: number = usage?.inputTokens || 0;
  const tokensOut: number = usage?.outputTokens || 0;
  return { text, tokensIn, tokensOut };
}
|
||
|
||
// ──────────────────────────────────────────────────────────────────────────
|
||
// Main entry point
|
||
// ──────────────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Mutates `body.messages` in place, replacing every `image_url` part with a
 * `text` part containing the Nova description. Always returns; never throws.
 * Failed images are replaced with an "unavailable" text part so the request
 * still goes through to DeepSeek.
 *
 * @param body Request body; only `body.messages` is read and modified.
 * @param opts `userPromptHint` replaces the default user prompt sent with
 *   each image when it is a non-empty string.
 * @returns Counters for this run; all zero when the body carries no images.
 */
export async function preprocessImages(
  body: Record<string, unknown>,
  opts: { userPromptHint?: string } = {},
): Promise<VisionPreprocessStats> {
  const stats = emptyStats();
  if (!Array.isArray(body.messages)) return stats;

  const started = Date.now();
  const messages = body.messages as MessageLike[];
  const userPrompt = opts.userPromptHint || DEFAULT_USER_PROMPT;

  // Collect every image part with a pointer to its location for in-place
  // replacement. This single scan also answers "are there any images?".
  type Job = { msgIdx: number; partIdx: number; url: string };
  const jobs: Job[] = [];
  for (let i = 0; i < messages.length; i++) {
    const content = messages[i]?.content;
    if (!Array.isArray(content)) continue;
    for (let j = 0; j < content.length; j++) {
      const part = content[j] as ContentPart;
      if (part && part.type === "image_url") {
        const url = (part as ImageUrlPart).image_url?.url || "";
        jobs.push({ msgIdx: i, partIdx: j, url });
      }
    }
  }

  stats.images = jobs.length;
  if (jobs.length === 0) return stats;

  // Bounded-concurrency worker pool: each worker claims the next unclaimed job
  // index until none are left.
  const results = new Array<TextPart>(jobs.length);
  let next = 0;
  const worker = async (): Promise<void> => {
    for (let i = next++; i < jobs.length; i = next++) {
      try {
        results[i] = await processOne(jobs[i], userPrompt, stats);
      } catch (e) {
        // An unexpected rejection must not fail the whole request: degrade
        // this one image to a placeholder instead.
        stats.failures++;
        const reason = e instanceof Error ? e.message : String(e);
        console.error("[vision-preprocess] image processing failed:", reason);
        results[i] = unavailable("vision preprocessing failed");
      }
    }
  };
  const poolSize = Math.min(MAX_CONCURRENCY, jobs.length);
  await Promise.all(Array.from({ length: poolSize }, () => worker()));

  // Apply replacements. Indices stay valid because nothing is inserted or
  // removed — one part in, one part out.
  for (let k = 0; k < jobs.length; k++) {
    const { msgIdx, partIdx } = jobs[k];
    const content = messages[msgIdx]?.content;
    if (Array.isArray(content)) content[partIdx] = results[k];
  }

  stats.latency_ms = Date.now() - started;
  return stats;
}
|
||
|
||
/**
 * Turn one image reference into the text part that replaces it.
 *
 * Steps: resolve the bytes, map the mime type to a Nova format, look the
 * description up in Redis by content hash, and otherwise call Nova and cache
 * the answer. Every failure yields an "unavailable" placeholder instead of an
 * exception.
 *
 * @param job Holds the `url` taken from the `image_url` part.
 * @param userPrompt Text sent to Nova together with the image.
 * @param stats Run-wide counters, updated in place (failures, cache hits,
 *   tokens, cost).
 * @returns A text part of the form `[image: <description>]` or
 *   `[image: unavailable — <reason>]`.
 */
async function processOne(
  job: { url: string },
  userPrompt: string,
  stats: VisionPreprocessStats,
): Promise<TextPart> {
  const resolved = await resolveImageBytes(job.url);
  if ("error" in resolved) {
    stats.failures++;
    return unavailable(resolved.error);
  }

  const format = novaFormatForMime(resolved.mime);
  if (!format) {
    stats.failures++;
    return unavailable(`unsupported mime: ${resolved.mime}`);
  }

  const key = cacheKey(sha256Hex(resolved.bytes));

  // Cache lookup. Treat any Redis error as a miss.
  if (redis) {
    try {
      const cached = await redis.get(key);
      if (cached) {
        stats.cache_hits++;
        return { type: "text", text: `[image: ${cached}]` };
      }
    } catch {
      // fall through to live call
    }
  }

  let description: string;
  try {
    const res = await describeWithNova(resolved.bytes, format, userPrompt);
    description = res.text.trim();
    stats.tokens_in += res.tokensIn;
    stats.tokens_out += res.tokensOut;
    stats.cost_usd += estimateCost(NOVA_MODEL, res.tokensIn, res.tokensOut);
  } catch (e) {
    stats.failures++;
    const reason = e instanceof Error ? e.message : String(e);
    console.error("[vision-preprocess] Nova call failed:", reason);
    return unavailable("vision preprocessing failed");
  }

  // Only cache real descriptions. Storing an empty answer or the
  // "(no description)" placeholder would pin it to this image for the whole
  // TTL and prevent any later retry.
  const cacheable = description.length > 0 && description !== "(no description)";
  if (redis && cacheable) {
    try {
      void redis.set(key, description, "EX", CACHE_TTL_SECONDS).catch(() => {
        /* cache write failures must not affect the response */
      });
    } catch {
      /* a synchronous failure of the cache write is ignored as well */
    }
  }

  return { type: "text", text: `[image: ${description}]` };
}
|
||
|
||
/** Placeholder text part used in place of an image that could not be described. */
function unavailable(reason: string): TextPart {
  const text = `[image: unavailable — ${reason}]`;
  return { type: "text", text };
}
|