# Per RFC 9309: a crawler picks the single most-specific matching User-agent
# group and ignores all others, including this wildcard group. Every
# specific-UA group below must therefore repeat the five Disallow paths —
# they are NOT inherited. (Bot review on PR #90 caught this for the new
# Googlebot* groups; the pre-existing AI groups had the same latent bug.)
#
# /v1/ blocks API endpoints. Google was 401-crawling /v1/companies/* until
# this line was added — robots.txt was guarding /api/ (Next.js proxy routes)
# but not /v1/ (the NestJS upstream API).
# Fallback group: used only by crawlers that have no more specific group in
# this file. The blocked prefixes and the allowed prefixes do not overlap, so
# the order of the rules below does not affect the outcome.
User-agent: *
# Blocked: API surfaces, search results, the dashboard, and build assets.
Disallow: /_next/
Disallow: /dashboard
Disallow: /search
Disallow: /v1/
Disallow: /api/
# Explicitly allowed public content sections.
Allow: /compare/
Allow: /glossary
Allow: /addresses/
Allow: /jurisdictions/
Allow: /companies/

# AI training crawlers — blocked. GPTBot feeds future GPT model training and
# returns nothing to us today (citations live behind OAI-SearchBot, not GPTBot
# — see https://platform.openai.com/docs/bots). 675k hits over the last 5
# days, ~95% of remaining bot bandwidth after the meta/Ahrefs/Semrush cut.
# `Disallow: /` matches every path, so GPTBot may fetch nothing on this host.
User-agent: GPTBot
Disallow: /

# AI product crawlers — allowed. These feed live product surfaces with
# source citations (ChatGPT Search, on-demand browsing, Perplexity), so the
# cost buys real referral traffic. Each group repeats the four wildcard
# Disallows because RFC 9309 does not inherit them.
# OAI-SearchBot: whole site allowed except the five Disallow paths repeated
# from the wildcard group. The Disallows are listed BEFORE the catch-all
# `Allow: /` on purpose: RFC 9309 parsers use longest-match, so order is
# irrelevant to them, but first-match parsers (the pre-RFC convention, e.g.
# Python's urllib.robotparser) would stop at a leading `Allow: /` and never
# apply the Disallows.
User-agent: OAI-SearchBot
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /
# Crawl-delay is not part of RFC 9309; it only throttles crawlers that choose
# to support it. NOTE(review): confirm this UA actually honors it.
Crawl-delay: 2

# ChatGPT-User (presumably the user-triggered browsing agent — confirm):
# whole site allowed except the five Disallow paths repeated from the
# wildcard group. Disallows precede the catch-all `Allow: /` so that
# first-match parsers (pre-RFC 9309 convention) still apply them; RFC 9309
# longest-match parsers are unaffected by rule order.
User-agent: ChatGPT-User
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /

# ClaudeBot (Anthropic's training crawler) — blocked. Same logic as GPTBot
# above: training crawl with no citation surface on the product side.
# Claude-User (the on-demand fetch bot that powers user-triggered browsing
# in claude.ai) stays allowed below — that one actually drives referral
# traffic. Crawling at ~63k req/36h before this block (99% of bot bandwidth).
# `Disallow: /` matches every path, so ClaudeBot may fetch nothing on this host.
User-agent: ClaudeBot
Disallow: /

# Claude-User (on-demand, user-triggered fetches): whole site allowed except
# the five Disallow paths repeated from the wildcard group. Disallows precede
# the catch-all `Allow: /` so that first-match parsers (pre-RFC 9309
# convention) still apply them; RFC 9309 longest-match parsers are unaffected
# by rule order.
User-agent: Claude-User
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /

# PerplexityBot: whole site allowed except the five Disallow paths repeated
# from the wildcard group. Disallows precede the catch-all `Allow: /` so that
# first-match parsers (pre-RFC 9309 convention) still apply them; RFC 9309
# longest-match parsers are unaffected by rule order.
User-agent: PerplexityBot
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /
# Crawl-delay is not part of RFC 9309; it only throttles crawlers that choose
# to support it. NOTE(review): confirm this UA actually honors it.
Crawl-delay: 2

# Google-Extended: whole site allowed except the five Disallow paths repeated
# from the wildcard group. Disallows precede the catch-all `Allow: /` so that
# first-match parsers (pre-RFC 9309 convention) still apply them; RFC 9309
# longest-match parsers are unaffected by rule order.
# NOTE(review): Google-Extended is understood to be a usage-control token for
# Google's AI products rather than a separate crawler — confirm that allowing
# it is intended, given that GPTBot and ClaudeBot are blocked in this file.
User-agent: Google-Extended
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /

# Google search — unthrottled. They barely crawl us right now (3 hits/hour
# vs 8.6k/hour from meta-externalagent); we want every crawl budget unit
# Google is willing to spend post-soft-404-fix. No crawl-delay anywhere.
# Each group repeats the four wildcard Disallows for the same reason.
# Googlebot: whole site allowed except the five Disallow paths repeated from
# the wildcard group; deliberately no Crawl-delay. Disallows precede the
# catch-all `Allow: /` so that any first-match parser (pre-RFC 9309
# convention) reading this group still applies them; longest-match parsers
# are unaffected by rule order.
# NOTE(review): `Disallow: /_next/` also covers Next.js build assets under
# /_next/static/ (JS/CSS), which a rendering crawler may need to fetch —
# confirm that blocking them is intended.
User-agent: Googlebot
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /

# Googlebot-Image: whole site allowed except the five Disallow paths repeated
# from the wildcard group; deliberately no Crawl-delay. Disallows precede the
# catch-all `Allow: /` so that first-match parsers (pre-RFC 9309 convention)
# still apply them; longest-match parsers are unaffected by rule order.
# NOTE(review): if pages serve images through /_next/image, the /_next/ rule
# keeps those URLs away from this crawler — verify that is acceptable.
User-agent: Googlebot-Image
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /

# Googlebot-News: whole site allowed except the five Disallow paths repeated
# from the wildcard group; deliberately no Crawl-delay. Disallows precede the
# catch-all `Allow: /` so that first-match parsers (pre-RFC 9309 convention)
# still apply them; longest-match parsers are unaffected by rule order.
User-agent: Googlebot-News
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /

# GoogleOther: whole site allowed except the five Disallow paths repeated
# from the wildcard group; deliberately no Crawl-delay. Disallows precede the
# catch-all `Allow: /` so that first-match parsers (pre-RFC 9309 convention)
# still apply them; longest-match parsers are unaffected by rule order.
User-agent: GoogleOther
Disallow: /api/
Disallow: /v1/
Disallow: /search
Disallow: /dashboard
Disallow: /_next/
Allow: /

# Blocked — AI training crawl with zero referral traffic returned, OR SEO
# recon tools used by competitors. Burning ~95% of origin bandwidth between
# them as of 2026-05-18 (see post-fix traffic retro). Explicit Disallow
# everything, not Crawl-delay — these UAs have a track record of ignoring
# delays.
# Meta crawler fleet — blocking everything. After PR #90 blocked
# meta-externalagent, Meta rotated load onto meta-webindexer at the same
# ~10k req/h rate within 12h. Same documented endpoint, different UA token.
# Listing every UA Meta documents at
# https://developers.facebook.com/docs/sharing/webmasters/crawler/ plus the
# legacy Facebot token. Includes facebookexternalhit — that will disable
# Facebook / Instagram / WhatsApp / Threads link previews for our URLs.
# We don't expect meaningful social sharing of registry pages; revisit if
# that assumption changes.
# One group per Meta UA token, each blocked from every path. Kept as separate
# single-token groups rather than one multi-token group.
User-agent: meta-externalagent
Disallow: /

User-agent: meta-externalfetcher
Disallow: /

# Token that picked up the crawl load after meta-externalagent was blocked.
User-agent: meta-webindexer
Disallow: /

# Blocking this token disables link previews on Meta's apps for our URLs —
# an accepted trade-off; revisit if social sharing becomes meaningful.
User-agent: facebookexternalhit
Disallow: /

User-agent: facebookcatalog
Disallow: /

# Legacy Meta token.
User-agent: Facebot
Disallow: /

# SEO tool crawlers, each blocked from every path.
User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

# NOTE(review): DotBot is presumably Moz's crawler — confirm.
User-agent: DotBot
Disallow: /

# Sitemap index locations. Sitemap lines are not part of any User-agent
# group, so they apply regardless of which group a crawler matched.
Sitemap: https://registry-lookup.com/sitemap-index.xml
Sitemap: https://registry-lookup.com/sitemaps/sitemap-companies-index.xml
Sitemap: https://registry-lookup.com/sitemaps/sitemap-addresses-index.xml