{
  "openapi": "3.1.0",
  "info": {
    "title": "Divepool Embedding API",
    "description": "Semantic search and real-time embedding stream for Bluesky posts. Embeddings are EmbeddingGemma-300m vectors (Matryoshka, L2-normalized): 128d for public access, 768d with bearer token. Contact @divepool.social on Bluesky for a bearer token.\n\nOnly posts with at least 120 characters and a detectable language are embedded (all languages — EmbeddingGemma is multilingual). Shorter posts are excluded from the stream and search results. Cluster topic labels (c-TF-IDF) are English/German only; other languages get clusters without labels.\n\nRate limits (Caddy, per IP): 30 req/min for all /api/v1/* endpoints, 500 req/s global across all endpoints.",
    "version": "0.7.0",
    "termsOfService": "https://divepool.social/terms",
    "contact": {
      "name": "Divepool",
      "url": "https://bsky.app/profile/divepool.social"
    }
  },
  "externalDocs": {
    "description": "Reference Go client with zstd stream decompression and search (source on Tangled)",
    "url": "https://tangled.sh/@divepool.social/embedding_firehose_client"
  },
  "servers": [
    {"url": "https://divepool.social", "description": "Production"}
  ],
  "paths": {
    "/api/v1/search": {
      "post": {
        "operationId": "search",
        "summary": "Semantic search",
        "description": "Search indexed Bluesky posts by text query, account, or example post. Also clusters a set of accounts by their precomputed topical medoids via `dids: [...]`. Works without authentication; bearer token unlocks full 768d embeddings (vs 128d public). At least one of query, did, dids, or rkey is required. query and rkey are mutually exclusive; dids cannot be combined with did/rkey/query. Max 4 concurrent searches server-wide.",
        "security": [{"bearerAuth": []}, {}],
        "parameters": [
          {
            "name": "X-Bluesky-Handle",
            "in": "header",
            "required": false,
            "schema": {"type": "string"},
            "description": "Your Bluesky handle (e.g. you.bsky.social). Totally optional — just helps us understand who's using the API and reach out for feedback. We'd love to hear from you!"
          }
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {"$ref": "#/components/schemas/SearchRequest"},
              "examples": {
                "globalSearch": {
                  "summary": "Global semantic search",
                  "value": {"query": "machine learning", "limit": 100}
                },
                "browseAccount": {
                  "summary": "Browse account (newest posts, clustered)",
                  "value": {"did": "did:plc:abc123"}
                },
                "searchInAccount": {
                  "summary": "Search within an account",
                  "value": {"did": "did:plc:abc123", "query": "machine learning"}
                },
                "similarPosts": {
                  "summary": "Find posts similar to a specific post",
                  "value": {"did": "did:plc:abc123", "rkey": "3abc"}
                },
                "clusterAccounts": {
                  "summary": "Cluster a set of accounts by their medoid posts (e.g. your followers)",
                  "value": {"dids": ["did:plc:abc123", "did:plc:def456", "did:plc:ghi789"]}
                }
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Search results",
            "headers": {
              "Cache-Control": {
                "schema": {"type": "string", "example": "private, max-age=30"}
              }
            },
            "content": {
              "application/json": {
                "schema": {"$ref": "#/components/schemas/SearchResponse"}
              }
            }
          },
          "400": {
            "description": "Invalid request — missing/conflicting params, unknown DID, or post not found",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "examples": {
                  "missingParam": {"value": "need at least one of: query, did, dids, rkey"},
                  "exclusive": {"value": "query and rkey are mutually exclusive"},
                  "rkeyNeedsDid": {"value": "rkey requires did"},
                  "didsExclusive": {"value": "dids cannot be combined with did, rkey, or query"},
                  "viewerRequiresDids": {"value": "viewer_did requires dids"},
                  "tooManyDIDs": {"value": "max 500 dids"},
                  "langsExclusive": {"value": "langs and viewer_did are mutually exclusive"},
                  "sinceUntilMode": {"value": "since/until cannot be combined with dids"},
                  "badSince": {"value": "since must be RFC3339"},
                  "badRange": {"value": "since must be before until"},
                  "minScoreMode": {"value": "min_score requires query or rkey"},
                  "badMinScore": {"value": "min_score must be between -1 and 1"},
                  "badJson": {"value": "bad json"},
                  "unknownDid": {"value": "unknown did"},
                  "postNotFound": {"value": "post not found or has no embedding"}
                }
              }
            }
          },
          "401": {
            "description": "Invalid bearer token (omitting the token is fine — returns 128d results)",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "example": "invalid token"
              }
            }
          },
          "429": {
            "description": "Rate limit exceeded (30 req/min per IP, enforced by Caddy)"
          },
          "500": {
            "description": "Internal error (search query or database failure)",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "examples": {
                  "fetchFailed": {"value": "failed to fetch texts"},
                  "gatherFailed": {"value": "failed to gather medoids"},
                  "searchFailed": {"value": "search failed"}
                }
              }
            }
          },
          "503": {
            "description": "Too many concurrent searches or embedding service unavailable",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "examples": {
                  "concurrency": {"value": "too many concurrent searches"},
                  "embedUnavailable": {"value": "embedding service unavailable"}
                }
              }
            }
          }
        }
      }
    },
    "/api/v1/embeddings": {
      "get": {
        "operationId": "streamEmbeddings",
        "summary": "Embedding stream",
        "description": "Real-time zstd-compressed NDJSON stream of embedding batches. Each line is a zstd-compressed JSON batch. Public clients receive 128d vectors; bearer clients receive full 768d. Empty batches (all arrays empty) are heartbeats sent every ~10s. Max 50 concurrent public connections; bearer clients are limited to 1 connection per token.\n\nThe stream primarily carries embeddings for newly created posts as they appear on the Bluesky network. However, embeddings are produced asynchronously after text processing, so the stream may occasionally lag behind real-time during periods of high load (e.g. bulk resyncs of account history). During such periods, the stream may also include embeddings for older posts that are being processed for the first time. Clients should not assume strict chronological ordering or that every embedding corresponds to a recent post.",
        "security": [{"bearerAuth": []}, {}],
        "responses": {
          "200": {
            "description": "Zstd-compressed NDJSON stream. Each line is a zstd-compressed EmbeddingBatch. Clients must decompress with zstd and parse line-by-line as JSON.",
            "headers": {
              "Content-Type": {
                "schema": {"type": "string", "const": "application/zstd"}
              },
              "Cache-Control": {
                "schema": {"type": "string", "example": "no-cache"}
              }
            },
            "content": {
              "application/zstd": {
                "schema": {"$ref": "#/components/schemas/EmbeddingBatch"}
              }
            }
          },
          "401": {
            "description": "Invalid bearer token",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "example": "invalid token"
              }
            }
          },
          "409": {
            "description": "Bearer token already has an active stream connection (close it first)",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "example": "already connected"
              }
            }
          },
          "429": {
            "description": "Rate limit exceeded (30 req/min per IP, enforced by Caddy)"
          },
          "503": {
            "description": "Too many concurrent public connections (max 50)",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "example": "too many connections"
              }
            }
          }
        }
      }
    },
    "/api/v1/medoids": {
      "post": {
        "operationId": "getMedoids",
        "summary": "Account medoids",
        "description": "Fetch top 3 cluster medoids (representative posts) for up to 25 accounts. Each medoid represents a topic cluster within the account, sorted by cluster size descending, labeled with c-TF-IDF topic keywords (en/de only). Each account also carries a single account_embedding (cluster-size-weighted medoid mean) for account-space math. Public: 128d L2-normalized embeddings. Bearer: full 768d.",
        "security": [{"bearerAuth": []}, {}],
        "parameters": [
          {
            "name": "X-Bluesky-Handle",
            "in": "header",
            "required": false,
            "schema": {"type": "string"},
            "description": "Your Bluesky handle (e.g. you.bsky.social). Totally optional — just helps us understand who's using the API and reach out for feedback. We'd love to hear from you!"
          }
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {"$ref": "#/components/schemas/MedoidsRequest"},
              "examples": {
                "basic": {
                  "summary": "Fetch medoids for two accounts",
                  "value": {"dids": ["did:plc:abc123", "did:plc:def456"]}
                },
                "single": {
                  "summary": "Fetch medoids for one account",
                  "value": {"dids": ["did:plc:abc123"]}
                }
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Medoids grouped by DID. DIDs not found in the system are silently omitted.",
            "headers": {
              "Cache-Control": {
                "schema": {"type": "string", "example": "private, max-age=300"}
              }
            },
            "content": {
              "application/json": {
                "schema": {"$ref": "#/components/schemas/MedoidsResponse"}
              }
            }
          },
          "400": {
            "description": "Invalid request",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "examples": {
                  "missing": {"value": "dids is required"},
                  "tooMany": {"value": "max 25 dids"},
                  "badJson": {"value": "bad json"}
                }
              }
            }
          },
          "401": {
            "description": "Invalid bearer token (omitting the token is fine — returns 128d results)",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "example": "invalid token"
              }
            }
          },
          "429": {
            "description": "Rate limit exceeded (30 req/min per IP, enforced by Caddy)"
          },
          "500": {
            "description": "Internal error (DID resolution or medoid fetch failure)"
          }
        }
      }
    },
    "/api/v1/similar-accounts": {
      "post": {
        "operationId": "similarAccounts",
        "summary": "Similar accounts",
        "description": "Given a DID, rank the semantically nearest accounts across the whole indexed corpus. Uses up to 6 of the subject's most distinct topic medoids as query vectors and scores each candidate account by cross-topic affinity (accounts matching several of the subject's topics rank above one-topic matches), excluding the subject itself. Collapses the medoids → per-medoid search → dedupe-to-accounts client pattern into one call.\n\nScores are relative: raw similarity lands in a narrow band, so each account also carries `margin` — its score minus the mean over the full internal candidate pool (typically hundreds of accounts, computed before the `limit` cut). Rank by `margin` when combining with other signals (e.g. multiply by follower reach to find amplifier candidates) and treat margin ≤ 0 as noise. Works without authentication; a bearer token only attributes usage (the response carries no embeddings).",
        "security": [{"bearerAuth": []}, {}],
        "parameters": [
          {
            "name": "X-Bluesky-Handle",
            "in": "header",
            "required": false,
            "schema": {"type": "string"},
            "description": "Your Bluesky handle (e.g. you.bsky.social). Totally optional — just helps us understand who's using the API and reach out for feedback. We'd love to hear from you!"
          }
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {"$ref": "#/components/schemas/SimilarAccountsRequest"},
              "examples": {
                "basic": {
                  "summary": "Nearest accounts, auto-scoped to the subject's languages",
                  "value": {"did": "did:plc:abc123", "limit": 30}
                },
                "denoised": {
                  "summary": "Suppress one-off topic clusters on both sides",
                  "value": {"did": "did:plc:abc123", "min_cluster_size": 5}
                },
                "explicitLangs": {
                  "summary": "Only accounts whose matching topic cluster is German",
                  "value": {"did": "did:plc:abc123", "langs": ["de"]}
                },
                "allLangs": {
                  "summary": "Disable the language scope (cross-lingual neighbors)",
                  "value": {"did": "did:plc:abc123", "langs": []}
                }
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Accounts ranked by similarity (descending). Empty `accounts` means no matching accounts were found — the subject has no indexed posts, min_cluster_size filtered all its topic clusters, or nothing matched the applied language scope.",
            "headers": {
              "Cache-Control": {
                "schema": {"type": "string", "example": "private, max-age=300"}
              }
            },
            "content": {
              "application/json": {
                "schema": {"$ref": "#/components/schemas/SimilarAccountsResponse"}
              }
            }
          },
          "400": {
            "description": "Invalid request",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "examples": {
                  "missingDid": {"value": "did is required"},
                  "badMinClusterSize": {"value": "min_cluster_size must be >= 0"},
                  "badJson": {"value": "bad json"},
                  "unknownDid": {"value": "unknown did"}
                }
              }
            }
          },
          "401": {
            "description": "Invalid bearer token (omitting the token is fine)",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "example": "invalid token"
              }
            }
          },
          "429": {
            "description": "Rate limit exceeded (30 req/min per IP, enforced by Caddy)"
          },
          "500": {
            "description": "Internal error (medoid fetch, search, or handle resolution failure)"
          },
          "503": {
            "description": "Too many concurrent searches (shares /search's server-wide limit of 4)",
            "content": {
              "text/plain": {
                "schema": {"type": "string"},
                "example": "too many concurrent searches"
              }
            }
          }
        }
      }
    },
    "/api/v1/embed": {
      "post": {
        "operationId": "embed",
        "summary": "Public batched embeddings",
        "description": "Embed up to N texts and receive per-text vectors. Bearer token holders get full 768d EmbeddingGemma vectors; anonymous callers get 128d Matryoshka-cropped, L2-normalized vectors. The `tasks` array selects one or both embedding spaces — `clustering` for grouping similar texts, `retrieval_doc` to embed against indexed posts (same space the firehose serves). Rate-limit accounting is `n_texts × len(tasks)`, so asking for both doubles your budget burn — pick the one you need.\n\nAnon limits: 10 emb/s/IP (burst 30), max 50 texts/req, 4096 chars/text, 1 MB body. Bearer limits: 100 emb/s/token (burst 300), max 200 texts/req, 8192 chars/text, 4 MB body. Global: 400 emb/s aggregate (auto-throttles to 100 emb/s when the public Mac is under contention).\n\nDecoding the response: each embedding is base64(little-endian float32). Python: `vec = struct.unpack(f'<{n}f', base64.b64decode(s))` where `n = dim`. JavaScript: decode base64 → `new Float32Array(buf)`.",
        "security": [{"bearerAuth": []}, {}],
        "parameters": [
          {
            "name": "X-Bluesky-Handle",
            "in": "header",
            "required": false,
            "schema": {"type": "string"},
            "description": "Your Bluesky handle (e.g. you.bsky.social). Totally optional — just helps us understand who's using the API and reach out for feedback. We'd love to hear from you!"
          }
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {"$ref": "#/components/schemas/EmbedRequest"},
              "examples": {
                "retrievalDocAnon": {
                  "summary": "Anonymous: retrieval-doc embeddings (128d, normalized)",
                  "value": {"tasks": ["retrieval_doc"], "texts": ["hello world"]}
                },
                "clusteringBearer": {
                  "summary": "Bearer: clustering embeddings (full 768d)",
                  "value": {"tasks": ["clustering"], "texts": ["a sentence", "another sentence"]}
                },
                "bothTasks": {
                  "summary": "Both task spaces in one call (charges 2× per text)",
                  "value": {"tasks": ["clustering", "retrieval_doc"], "texts": ["hi"]}
                },
                "bearerCropped": {
                  "summary": "Bearer asking for cropped 128d (bandwidth-efficient)",
                  "value": {"tasks": ["retrieval_doc"], "texts": ["hi"], "crop_to_128d": true}
                }
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Embeddings keyed by task name. The `embeddings` map has one key per requested task; each value is an array of base64 float32-LE strings aligned with the input `texts`.",
            "content": {
              "application/json": {
                "schema": {"$ref": "#/components/schemas/EmbedResponse"}
              }
            }
          },
          "400": {
            "description": "Invalid request — missing/empty/duplicate tasks, unknown task value, oversized text, too many texts, body over the per-tier cap, or malformed JSON.",
            "content": {
              "application/json": {
                "schema": {"$ref": "#/components/schemas/EmbedError"},
                "examples": {
                  "missingTasks": {"value": {"error": "tasks: required (must be a non-empty array of \"clustering\" or \"retrieval_doc\")"}},
                  "badTaskValue": {"value": {"error": "tasks: each entry must be \"clustering\" or \"retrieval_doc\""}},
                  "duplicateTasks": {"value": {"error": "tasks: duplicate values are not allowed"}},
                  "tooManyTexts": {"value": {"error": "texts: too many; max 50 for this tier"}},
                  "emptyTexts": {"value": {"error": "texts: required (must be a non-empty array)"}},
                  "charLimit": {"value": {"error": "texts: at least one text exceeds the per-text char limit (4096)"}},
                  "bodyTooLarge": {"value": {"error": "request body too large"}},
                  "malformedJSON": {"value": {"error": "malformed JSON"}}
                }
              }
            }
          },
          "401": {
            "description": "An Authorization: Bearer header was supplied but the token is unknown or revoked. Omit the header to fall back to the anonymous tier (128d).",
            "content": {
              "application/json": {
                "schema": {"$ref": "#/components/schemas/EmbedError"},
                "example": {"error": "invalid bearer token"}
              }
            }
          },
          "429": {
            "description": "Rate limit exceeded — anon: per-IP token bucket (10/s, burst 30); bearer: per-token bucket (100/s, burst 300). Wait the seconds in `Retry-After` and try again. Asking for both tasks counts double against the bucket.",
            "headers": {
              "Retry-After": {
                "schema": {"type": "integer"},
                "description": "Seconds to wait before retrying."
              }
            },
            "content": {
              "application/json": {
                "schema": {"$ref": "#/components/schemas/EmbedError"},
                "example": {"error": "rate limit exceeded"}
              }
            }
          },
          "503": {
            "description": "Public dispatcher queue is full (heavy contention) or the upstream Mac is unreachable. Retry after the seconds in `Retry-After`. The `error` body distinguishes queue/upstream/timeout.",
            "headers": {
              "Retry-After": {
                "schema": {"type": "integer"},
                "description": "Seconds to wait before retrying."
              }
            },
            "content": {
              "application/json": {
                "schema": {"$ref": "#/components/schemas/EmbedError"},
                "examples": {
                  "queueFull": {"value": {"error": "queue full; try again shortly"}},
                  "queueTimeout": {"value": {"error": "queue wait timed out; try again shortly"}},
                  "upstream": {"value": {"error": "upstream embedding service unavailable"}}
                }
              }
            }
          }
        }
      }
    }
  },
  "components": {
    "securitySchemes": {
      "bearerAuth": {
        "type": "http",
        "scheme": "bearer"
      }
    },
    "schemas": {
      "SearchRequest": {
        "type": "object",
        "description": "At least one of query, did, dids, or rkey is required. query and rkey are mutually exclusive. rkey requires did. dids cannot be combined with did, rkey, or query. langs, since, until, and min_score are optional filters with per-mode rules — see each property.",
        "properties": {
          "query": {
            "type": "string",
            "description": "Semantic search text. Mutually exclusive with rkey."
          },
          "did": {
            "type": "string",
            "description": "Scope search to an AT Protocol DID. Alone: returns newest posts, always clustered. With query or rkey: scoped search."
          },
          "dids": {
            "type": "array",
            "items": {"type": "string"},
            "minItems": 1,
            "maxItems": 500,
            "description": "Cluster a set of accounts (e.g. your followers) by their precomputed topical medoids. Returns each medoid post in `results` (tagged with `cluster_id`/`topics`) and HDBSCAN groupings in `clusters`. An account can appear in multiple clusters since each account contributes its own per-topic medoids. Cannot be combined with did, rkey, or query."
          },
          "viewer_did": {
            "type": "string",
            "description": "Only with dids: restrict the clustered medoids to the languages this account posts in (derived from its own medoids, cluster-size weighted). The derived languages are echoed in the response `langs`. An unknown viewer_did or one without indexed posts derives no languages and applies no filter."
          },
          "rkey": {
            "type": "string",
            "description": "Use this post's embedding as the query vector, searching within that account's posts (rkey requires did, and did scopes the search). Mutually exclusive with query."
          },
          "limit": {
            "type": "integer",
            "minimum": 1,
            "maximum": 1200,
            "description": "Max results. Global (no did): default and max 400. DID-scoped: default and max 1200. Values ≤0 or above max are clamped to the default."
          },
          "distinct": {
            "type": "boolean",
            "default": true,
            "description": "One result per account (global search only)."
          },
          "cluster": {
            "type": "boolean",
            "default": false,
            "description": "Enable UMAP+HDBSCAN clustering with c-TF-IDF topics. Requires ≥10 results. Forced true for did-only and dids modes."
          },
          "include_embeddings": {
            "type": "boolean",
            "default": false,
            "description": "Include per-result embedding vectors. 128d without bearer, 768d with bearer."
          },
          "langs": {
            "type": "array",
            "items": {"type": "string"},
            "minItems": 1,
            "description": "Only return posts in these languages (ISO 639-1 codes, e.g. [\"de\",\"en\"]). Works in every mode: post searches filter the detected language, dids mode filters the medoid language. Mutually exclusive with viewer_did. Echoed in the response `langs`. In global search the filter applies after the ANN stage, so heavily filtered searches can return fewer than `limit` results."
          },
          "since": {
            "type": "string",
            "format": "date-time",
            "description": "Only return posts created at or after this RFC3339 timestamp. Any post mode (global, did-scoped, browse); cannot be combined with dids (medoids have no time axis). Same post-ANN recall caveat as langs in global search."
          },
          "until": {
            "type": "string",
            "format": "date-time",
            "description": "Only return posts created before this RFC3339 timestamp. Same mode rules as since. Must be after since when both are set."
          },
          "min_score": {
            "type": "number",
            "minimum": -1,
            "maximum": 1,
            "description": "Minimum cosine similarity to the query (embeddings are L2-normalized, so cosine similarity = -score). Results with -score below this are dropped. Any mode with a query vector (query or rkey, global or did-scoped)."
          }
        }
      },
      "SearchResponse": {
        "type": "object",
        "required": ["results"],
        "properties": {
          "results": {
            "type": "array",
            "items": {"$ref": "#/components/schemas/SearchResult"}
          },
          "clusters": {
            "type": "array",
            "items": {"$ref": "#/components/schemas/SearchCluster"},
            "description": "Present when cluster=true and ≥10 results."
          },
          "langs": {
            "type": "array",
            "items": {"type": "string"},
            "description": "The applied language filter: the request's explicit langs, or the languages derived from viewer_did. Absent when no language filter was applied."
          }
        }
      },
      "SearchResult": {
        "type": "object",
        "required": ["did", "collection", "rkey", "text", "score"],
        "properties": {
          "did": {"type": "string", "description": "AT Protocol DID of the post author."},
          "handle": {"type": "string", "description": "Bluesky handle. Omitted if unresolved."},
          "collection": {"type": "string", "description": "AT Protocol collection NSID (e.g. app.bsky.feed.post)."},
          "rkey": {"type": "string", "description": "Record key within the collection."},
          "text": {"type": "string", "description": "Post text content."},
          "score": {"type": "number", "description": "Negative inner product (lower = more similar)."},
          "created_at": {"type": "string", "format": "date-time", "description": "Post creation time (RFC 3339). Omitted if unavailable."},
          "detected_lang": {"type": "string", "description": "Detected language code (e.g. en, de). Omitted if unavailable."},
          "cluster_id": {"type": "integer", "description": "Cluster assignment. Present only when clustering is active."},
          "topics": {"type": "array", "items": {"type": "string"}, "description": "c-TF-IDF topic terms for this result's cluster. Present only when clustering is active."},
          "embedding": {"type": "array", "items": {"type": "number"}, "description": "Embedding vector (128d public, 768d bearer). Present only when include_embeddings=true."}
        }
      },
      "SearchCluster": {
        "type": "object",
        "required": ["id", "size", "result_indices"],
        "properties": {
          "id": {"type": "integer", "description": "Cluster ID (sequential from 0)."},
          "size": {"type": "integer", "description": "Total posts in this cluster."},
          "topics": {"type": "array", "items": {"type": "string"}, "description": "c-TF-IDF extracted topic terms."},
          "result_indices": {"type": "array", "items": {"type": "integer"}, "description": "Indices into the results array belonging to this cluster."},
          "medoid_index": {"type": "integer", "description": "Index into results array of the cluster's most representative post."},
          "medoid_embedding": {"type": "array", "items": {"type": "number"}, "description": "Embedding of the medoid post (128d public, 768d bearer)."}
        }
      },
      "MedoidsRequest": {
        "type": "object",
        "required": ["dids"],
        "properties": {
          "dids": {
            "type": "array",
            "items": {"type": "string"},
            "minItems": 1,
            "maxItems": 25,
            "description": "AT Protocol DIDs to fetch medoids for."
          }
        }
      },
      "MedoidsResponse": {
        "type": "object",
        "required": ["accounts"],
        "properties": {
          "accounts": {
            "type": "object",
            "additionalProperties": {"$ref": "#/components/schemas/AccountMedoids"},
            "description": "Map from DID to its medoids. DIDs not found are omitted."
          }
        }
      },
      "AccountMedoids": {
        "type": "object",
        "required": ["medoids"],
        "properties": {
          "medoids": {
            "type": "array",
            "items": {"$ref": "#/components/schemas/Medoid"},
            "description": "Account's top medoids, sorted by cluster size descending."
          },
          "account_embedding": {
            "type": "array",
            "items": {"type": "number"},
            "description": "Single account-level embedding: cluster-size-weighted mean of ALL the account's medoids (not just the top 3 returned), L2-normalized. 128d public, 768d bearer. Absent when the account has no embedded medoids."
          }
        }
      },
      "Medoid": {
        "type": "object",
        "required": ["cluster_id", "is_primary", "collection", "rkey", "cluster_size", "embedding"],
        "properties": {
          "cluster_id": {"type": "integer", "description": "Cluster ID within the account."},
          "is_primary": {"type": "boolean", "description": "Whether this is the largest cluster."},
          "collection": {"type": "string", "description": "AT Protocol collection NSID (e.g. app.bsky.feed.post)."},
          "rkey": {"type": "string", "description": "Record key of the medoid post."},
          "cluster_size": {"type": "integer", "description": "Number of posts in this cluster."},
          "embedding": {"type": "array", "items": {"type": "number"}, "description": "Cluster embedding (128d L2-normalized public, 768d bearer)."},
          "topics": {"type": "array", "items": {"type": "string"}, "description": "c-TF-IDF topic keywords for this cluster. English/German only — medoids in other languages come back without topics."}
        }
      },
      "SimilarAccountsRequest": {
        "type": "object",
        "required": ["did"],
        "properties": {
          "did": {
            "type": "string",
            "description": "AT Protocol DID of the subject account."
          },
          "limit": {
            "type": "integer",
            "minimum": 1,
            "maximum": 100,
            "description": "Max accounts returned. Default and max 100; values ≤0 or above max are clamped. The margin baseline always uses the full candidate pool, so a small limit doesn't shift the margins."
          },
          "min_cluster_size": {
            "type": "integer",
            "minimum": 0,
            "default": 0,
            "description": "Suppress noisy one-off topic clusters on both sides: the subject's topic clusters below this size contribute no query vector, and candidate accounts can only match via clusters at or above it. 0 = no floor. Note: accounts with few indexed posts have a single small cluster, so high values exclude small accounts entirely."
          },
          "langs": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Language scope on the candidates' matching topic clusters (ISO 639-1). Omitted: derived from the subject's own posting languages (≥5% of its topic mass) — the recommended default. Empty array []: no language filter (cross-lingual neighbors). Explicit list: exactly those languages. The applied scope is echoed in the response `langs`."
          }
        }
      },
      "SimilarAccountsResponse": {
        "type": "object",
        "required": ["accounts", "score_mean"],
        "properties": {
          "accounts": {
            "type": "array",
            "items": {"$ref": "#/components/schemas/SimilarAccount"},
            "description": "Ranked by score descending."
          },
          "score_mean": {
            "type": "number",
            "description": "Mean score over the full internal candidate pool — the centering anchor the margins are computed against (score = margin + score_mean). 0 when accounts is empty."
          },
          "langs": {
            "type": "array",
            "items": {"type": "string"},
            "description": "The applied language scope: the request's explicit langs, or the languages derived from the subject. Absent when no language filter was applied."
          }
        }
      },
      "SimilarAccount": {
        "type": "object",
        "required": ["did", "score", "margin", "matched"],
        "properties": {
          "did": {"type": "string", "description": "AT Protocol DID of the similar account."},
          "handle": {"type": "string", "description": "Bluesky handle. Omitted if none recorded."},
          "score": {
            "type": "number",
            "description": "Cross-topic affinity: log-sum-exp pooling (α=3) of the cosine similarities between this account's best-matching medoid and all of the subject's query medoids. Matching several of the subject's topics scores above a single perfect-topic match, so values can exceed 1. Relative — compare within one response only."
          },
          "margin": {
            "type": "number",
            "description": "score − score_mean. The discriminative number: raw scores band tightly, margins spread. Treat ≤ 0 as noise; multiply by your own reach/recency signals for ranking."
          },
          "matched": {"$ref": "#/components/schemas/MatchedMedoid"}
        }
      },
      "MatchedMedoid": {
        "type": "object",
        "required": ["collection", "rkey", "cluster_size"],
        "description": "The candidate account's medoid post that sat nearest to one of the subject's medoids — the concrete \"why\" behind the match. Fetch the post via the Bluesky API (did + collection + rkey) to render it.",
        "properties": {
          "collection": {"type": "string", "description": "AT Protocol collection NSID (e.g. app.bsky.feed.post)."},
          "rkey": {"type": "string", "description": "Record key of the matched medoid post."},
          "cluster_size": {"type": "integer", "description": "Number of posts in the matched topic cluster — the mass behind the match."}
        }
      },
      "EmbeddingBatch": {
        "type": "object",
        "required": ["did", "col", "rkey", "lang", "c", "r"],
        "description": "Columnar batch of embeddings. All arrays have the same length. Empty arrays (length 0) indicate a heartbeat.",
        "properties": {
          "did": {"type": "array", "items": {"type": "string"}, "description": "AT Protocol DIDs."},
          "col": {"type": "array", "items": {"type": "string"}, "description": "AT Protocol collection NSIDs (e.g. app.bsky.feed.post)."},
          "rkey": {"type": "array", "items": {"type": "string"}, "description": "Record keys."},
          "lang": {"type": "array", "items": {"type": "string"}, "description": "Detected language codes (e.g. en, de)."},
          "c": {"type": "array", "items": {"type": "array", "items": {"type": "number"}}, "description": "Cluster embeddings (128d public, 768d bearer)."},
          "r": {"type": "array", "items": {"type": "array", "items": {"type": "number"}}, "description": "Retrieval embeddings (128d public, 768d bearer)."}
        }
      },
      "EmbedRequest": {
        "type": "object",
        "required": ["tasks", "texts"],
        "properties": {
          "tasks": {
            "type": "array",
            "minItems": 1,
            "maxItems": 2,
            "uniqueItems": true,
            "items": {"type": "string", "enum": ["clustering", "retrieval_doc"]},
            "description": "Required. One or two distinct task spaces. `clustering` is good for grouping similar texts; `retrieval_doc` is the same space we use to index posts (use it if you plan to query against indexed posts later). Each task space adds 1× to the rate-limit cost per text."
          },
          "texts": {
            "type": "array",
            "minItems": 1,
            "items": {"type": "string"},
            "description": "Texts to embed. Anon: max 50 entries × 4096 chars. Bearer: max 200 entries × 8192 chars."
          },
          "crop_to_128d": {
            "type": "boolean",
            "default": false,
            "description": "Bearer-only override to receive 128d Matryoshka-cropped + normalized embeddings instead of full 768d (saves bandwidth). Anonymous callers always get 128d regardless of this flag."
          }
        }
      },
      "EmbedResponse": {
        "type": "object",
        "required": ["tasks", "dim", "embeddings", "request_id"],
        "properties": {
          "tasks": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Echoed back from request — the task spaces that the response covers."
          },
          "dim": {
            "type": "integer",
            "enum": [128, 768],
            "description": "Dimensionality of returned vectors. 128 for anon (or bearer with crop_to_128d=true). 768 for bearer default."
          },
          "embeddings": {
            "type": "object",
            "description": "Keyed by task name. Each value is an array of base64-encoded little-endian float32 vectors aligned with the input `texts` array. Decode with: bytes = base64.b64decode(s); vec = struct.unpack(f'<{len(bytes)//4}f', bytes).",
            "additionalProperties": {
              "type": "array",
              "items": {"type": "string"}
            }
          },
          "request_id": {
            "type": "string",
            "format": "uuid",
            "description": "Server-generated UUID. Quote it if you reach out about a specific request — we can correlate it with our logs."
          },
          "token_handle": {
            "type": "string",
            "nullable": true,
            "description": "Echoes back the bearer token's owner handle. Null for anonymous requests."
          }
        }
      },
      "EmbedError": {
        "type": "object",
        "required": ["error"],
        "properties": {
          "error": {
            "type": "string",
            "description": "Human-readable error message."
          }
        }
      }
    }
  }
}
