# robots.txt
#
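# Tells search-engine and other crawlers which file & directory paths
# they may or may not fetch.
#
# Specs: http://www.robotstxt.org/robotstxt.html
#        https://www.rfc-editor.org/rfc/rfc9309 (RFC 9309, Robots Exclusion Protocol)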

# (Google) crawlers that share the same rule set:
#   Googlebot        - Search crawler
#   Googlebot-Image  - Images crawler
#   Googlebot-Mobile - Mobile crawler
# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
User-agent: Googlebot
User-agent: Googlebot-Image
User-agent: Googlebot-Mobile
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# (Google) News crawler
User-agent: Googlebot-News
Disallow: /

# (Google) Video crawler
User-agent: Googlebot-Video
Disallow: /

# (Google) "used by Search testing tools"
User-agent: Google-InspectionTool
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# (Google) "generic crawler that may be used by various product teams"
User-agent: GoogleOther
Disallow: /

# (Google) Mediapartners crawler
User-agent: Mediapartners-Google
Disallow: /

# (Google) Ads crawler
User-agent: AdsBot-Google
Disallow: /

# (Google) Ecommerce crawler
User-agent: Storebot-Google
Disallow: /

# (Google) Search Appliance crawler
# https://www.google.com/support/enterprise/static/gsa/docs/admin/current/gsa_doc_set/admin_crawl/preparing.html
User-agent: gsa-crawler
Disallow: /

# (Google) AI crawler
# https://searchengineland.com/google-extended-crawler-432636
# https://darkvisitors.com/agents/google-extended
User-agent: Google-Extended
Disallow: /

# (OpenAI, L.L.C.) crawlers - one shared group, everything disallowed
# https://platform.openai.com/docs/bots
#
# ChatGPT crawler 1
# https://darkvisitors.com/agents/chatgpt-user
# "ChatGPT-User is dispatched by OpenAI's ChatGPT in response to user prompts."
User-agent: ChatGPT-User
# ChatGPT crawler 2
# https://darkvisitors.com/agents/gptbot
User-agent: GPTBot
# OAI-SearchBot
# https://openai.com/index/searchgpt-prototype/
# "a temporary prototype of new AI search features"
# "OAI-SearchBot is for search."
User-agent: OAI-SearchBot
Disallow: /

# Anthropic crawler 1
# ("Claude is an AI assistant built by Anthropic")
# https://darkvisitors.com/agents/claudebot
User-agent: ClaudeBot
Disallow: /

# Anthropic crawler 2
# https://darkvisitors.com/agents/claude-web
User-agent: Claude-Web
Disallow: /

# Anthropic crawler 3
# https://darkvisitors.com/agents/anthropic-ai
# "anthropic-ai is [...] used [...] to download training data for its LLMs"
User-agent: anthropic-ai
Disallow: /

# X/Twitter
User-agent: Twitterbot
Disallow: /

# (Meta) FacebookExternalHit
# https://developers.facebook.com/docs/sharing/webmasters/web-crawlers
# "The primary purpose of FacebookExternalHit is to crawl
# the content of an app or website that was shared on
# one of Meta’s family of apps, such as Facebook, Instagram, or Messenger."
# User-agent: facebookexternalhit
# User-agent: facebookcatalog

# (Meta) Facebook AI crawler
# https://developers.facebook.com/docs/sharing/bot/
# "FacebookBot crawls public web pages to improve language models
# for our speech recognition technology. We use polite web crawling
# protocols that respect standard robots.txt rules."
# https://darkvisitors.com/agents/facebookbot
User-agent: FacebookBot
Disallow: /

# (Meta) Meta-ExternalAgent
# https://developers.facebook.com/docs/sharing/webmasters/web-crawlers
# "The Meta-ExternalAgent crawler crawls the web for use cases such as
# training AI models or improving products by indexing content directly."
User-agent: meta-externalagent
Disallow: /

# (Meta) Meta-ExternalFetcher (rule left disabled; see the quoted note below)
# https://developers.facebook.com/docs/sharing/webmasters/web-crawlers
# "The Meta-ExternalFetcher crawler performs user-initiated
# fetches of individual links to support specific product functions.
# Because the fetch was initiated by a user,
# this crawler may bypass robots.txt rules."
# User-agent: meta-externalfetcher
# Disallow: /

# Common Crawl
# https://commoncrawl.org/ccbot
# https://darkvisitors.com/agents/ccbot
User-agent: CCBot
Disallow: /

# Cohere crawler
# ("Enterprise AI Platform")
# https://cohere.com/
# https://darkvisitors.com/agents/cohere-ai
User-agent: cohere-ai
Disallow: /

# Omgili crawlers (both user-agent tokens share one group)
# https://webz.io/
# https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/
# https://darkvisitors.com/agents/omgili
# "Omgili is a web crawler used by Webz.io to maintain a repository of
# web crawl data that it sells to other companies,
# including those using it to train AI models."
User-agent: omgilibot
User-agent: omgili
Disallow: /

# Amazon crawler
User-agent: Amazonbot
Disallow: /

# Perplexity crawler
User-agent: PerplexityBot
Disallow: /

# YouBot crawler
User-agent: YouBot
Disallow: /

# Diffbot
# "Web Data for your AI"
# https://www.diffbot.com/
# https://darkvisitors.com/agents/diffbot
User-agent: Diffbot
Disallow: /

# (Timpi Inc.) Timpibot
# https://darkvisitors.com/agents/timpibot
# "Timpibot is used by Timpi's decentralized network of [...] node operators."
User-agent: Timpibot
Disallow: /

# Internet Archive
# https://archive.org/details/archive.org_bot
# https://whatmyuseragent.com/bots/archive-org-bot
User-agent: archive.org_bot
Disallow: /now
Disallow: /assets/fonts/

# DuckDuckGo crawler
User-agent: DuckDuckBot
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# Yahoo! Search crawler
User-Agent: Slurp
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# (Microsoft) crawlers that share the same rule set:
#   msnbot       - Search crawler
#   msnbot-media - MSNBot-Media Images crawler
#   bingbot      - Bing Search crawler
# https://blogs.bing.com/webmaster/May-2012/To-crawl-or-not-to-crawl,-that-is-BingBot-s-questi/
User-agent: msnbot
User-agent: msnbot-media
User-agent: bingbot
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# (Microsoft) Bing Ads crawler
User-agent: adidxbot
Disallow: /

# Twitter crawler
# https://udger.com/resources/ua-list/bot-detail?bot=Twitterbot
# NOTE(review): this file also contains a `User-agent: Twitterbot` group with
# `Disallow: /`, which contradicts the rules below. Confirm which policy is
# intended, and whether the `/1.0` version suffix used here is actually
# matched by the crawler (user-agent tokens normally omit the version).
User-agent: Twitterbot/1.0
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# Apple crawler (Siri)
# https://support.apple.com/en-gb/HT204683
# https://darkvisitors.com/agents/applebot
User-agent: Applebot
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# Apple crawler ("AI")
# https://darkvisitors.com/agents/applebot-extended
User-agent: Applebot-Extended
Disallow: /

# (Castle Global, Inc.) Imagesift by Hive
# https://imagesift.com/about
# "ImageSiftBot is a web crawler that scrapes the internet for
# publicly available images to support our suite of web intelligence products"
# https://thehive.ai/privacy
User-Agent: ImagesiftBot
Disallow: /

# (Ask) Teoma crawler
# https://en.wikipedia.org/wiki/Teoma
User-agent: Teoma
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# (Mojeek Limited) generic crawler
# https://www.mojeek.com/bot.html
User-agent: MojeekBot
Allow: /
Disallow: /now
Disallow: /assets/fonts/

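# (SEOmoz) rogerbot crawler
# https://udger.com/resources/ua-list/bot-detail?bot=rogerbot
# https://moz.com/help/moz-procedures/crawlers/rogerbot
# NOTE: no group is defined for rogerbot in this file, so it is only
# covered by the `User-agent: *` rules.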

# (SEOmoz) dotbot crawler
# https://udger.com/resources/ua-list/bot-detail?bot=DotBot
# https://moz.com/help/moz-procedures/crawlers/dotbot
User-agent: dotbot
Disallow: /

# SemrushBot crawler
# https://udger.com/resources/ua-list/bot-detail?bot=SemrushBot
# https://www.semrush.com/bot/
User-agent: SemrushBot
Disallow: /

# DeepCrawl crawler
# https://udger.com/resources/ua-list/bot-detail?bot=deepcrawl
# https://www.deepcrawl.com/bot/
User-agent: deepcrawl
Disallow: /

# (Majestic) MJ12bot crawler
# https://udger.com/resources/ua-list/bot-detail?bot=MJ12bot
User-agent: MJ12bot
Disallow: /

# 360Spider crawler
# https://udger.com/resources/ua-list/bot-detail?bot=360Spider
User-agent: 360Spider
Disallow: /

# Sogou crawler
# https://useragentstring.com/sogou%20spider_id_5749.php
User-agent: Sogou
Disallow: /

# Yandex Search crawler
User-agent: Yandex
Disallow: /

# Rambler crawler
# https://useragentstring.com/index.php?id=4238
User-agent: StackRambler
Disallow: /

# (Mail.ru) Mail.RU_Bot crawler
User-agent: Mail.RU_Bot
Disallow: /

# MegaIndex crawlers (both user-agent tokens share one group)
User-agent: MegaIndex
User-agent: MegaIndex.ru
Disallow: /

# (Baidu) crawlers - one shared group, everything disallowed:
#   baiduspider        - General crawler
#   baiduspider-image  - Images crawler
#   baiduspider-mobile - Mobile crawler
#   baiduspider-news   - News crawler
#   baiduspider-video  - Video crawler
#   baidu              - generic crawler
# https://yoast.com/ultimate-guide-robots-txt/
User-agent: baiduspider
User-agent: baiduspider-image
User-agent: baiduspider-mobile
User-agent: baiduspider-news
User-agent: baiduspider-video
User-agent: baidu
Disallow: /

# (ByteDance/TikTok/Doubao) ByteSpider crawler
# https://darkvisitors.com/agents/bytespider
# "used to download training data for its LLMs"
User-agent: ByteSpider
Disallow: /

# (Taobao) eTao
# "product search engine"
User-agent: EtaoSpider
Disallow: /

# (Kakao Corp.) Daumoa crawler
# https://udger.com/resources/ua-list/bot-detail?bot=Daumoa
User-agent: daumoa
Disallow: /

# (Kakao Corp.) DAUM crawler
User-agent: DAUM
Disallow: /

# (NAVER Corp) NAVER crawler
# https://udger.com/resources/ua-list/bot-detail?bot=naverbot
User-agent: naverbot
Disallow: /

# (NAVER Corp) Yeti crawler
User-agent: Yeti
Disallow: /

# Neevabot crawler
# https://udger.com/resources/ua-list/bot-detail?bot=Neevabot
# https://neeva.com/neevabot
User-agent: Neevabot
Disallow: /

# Zoombot crawler
# https://udger.com/resources/ua-list/bot-detail?bot=Zoombot
User-agent: Zoombot
Disallow: /

# OrangeBot crawler
# https://udger.com/resources/ua-list/bot-detail?bot=OrangeBot
User-agent: OrangeBot
Disallow: /

# SeznamBot crawler
# https://napoveda.seznam.cz/en/seznambot-crawler/
User-agent: seznambot
Disallow: /

# Pinterest crawler
# https://help.pinterest.com/en/business/article/pinterest-crawler
User-agent: Pinterestbot
Disallow: /

# (Datafiniti, LLC) 008 crawler
# https://80legs.com/
# https://en.wikipedia.org/wiki/80legs
User-agent: 008
Disallow: /

# (aiHit Ltd.)
# https://www.aihitdata.com/about
# https://whatmyuseragent.com/bots/aihitbot
# "Company Database"
User-agent: aiHitBot
Disallow: /

# Dataprovider.com crawler (used by Google, etc.)
# https://www.dataprovider.com/cases/
# https://www.dataprovider.com/spider/
User-agent: Dataprovider.com
Allow: /
Disallow: /now
Disallow: /assets/fonts/

# Every bot that might possibly read and respect this file
# (fallback group for crawlers without a dedicated group of their own).
User-agent: *
Disallow: /now
Disallow: /assets/fonts/
# Wait 1 second between successive requests.
# Crawl-delay is a non-standard directive and is only honored by some crawlers.
# It is kept inside this group with no blank line before it, because parsers
# that treat a blank line as the end of a record would otherwise ignore it.
Crawl-delay: 1