feat(marketing): exempt public blueprints from noindex + fix / route collision
- add_no_crawl_headers now skips marketing.*, legal.*, billing.success, static, and robots_txt endpoints via _is_public_indexable_endpoint helper; all other routes keep the X-Robots-Tag noindex header - recordings.index drops @login_required and instead redirects anonymous users to marketing.landing, resolving the URL-map collision between recordings_bp and marketing_bp at "/" - robots.txt rewritten: public marketing pages and /legal/* allowed, /api/, /admin, /account, /share/, /app/, /checkout, /login, /signup, /webhooks/ disallowed; Googlebot, Bingbot, ClaudeBot, GPTBot, PerplexityBot, Applebot explicitly allowed - New tests/test_no_crawl_headers.py (14 tests) covers exemption helper + integration on /, /robots.txt, /static, /admin, /login - New tests/test_marketing_root_redirect.py (4 tests) verifies anonymous users at / never get a /login redirect Tests verified via AST + logic walkthrough; pytest blocked on Windows by pre-existing fcntl import in src/init_db.py (B-1.2 limitation).
This commit is contained in:
28
src/app.py
28
src/app.py
@@ -654,12 +654,40 @@ from src.config.startup import initialize_file_monitor, get_file_monitor_functio
|
||||
# Execute one-time startup tasks against the fully-configured app object.
# NOTE(review): runs at import time — presumably idempotent; confirm against
# src.config.startup before adding anything stateful above this line.
run_startup_tasks(app)
|
||||
|
||||
# --- No-Crawl System: HTTP Headers ---
|
||||
# Endpoints that must remain indexable by search engines and AI crawlers.
|
||||
# Public marketing/legal/billing-success pages are exempted from the
|
||||
# X-Robots-Tag noindex header so they can be discovered (Loi 25 transparency,
|
||||
# GEO/SEO strategy). All other routes (api, admin, account, share, app, auth,
|
||||
# recordings dashboard, etc.) keep the noindex header as defense-in-depth.
|
||||
_PUBLIC_INDEXABLE_PREFIXES = ('marketing.', 'legal.')
|
||||
_PUBLIC_INDEXABLE_ENDPOINTS = frozenset({
|
||||
'billing.success', # post-payment confirmation page (added in B-2.7)
|
||||
'robots_txt', # served from /robots.txt
|
||||
'static', # static asset serving
|
||||
})
|
||||
|
||||
|
||||
def _is_public_indexable_endpoint(endpoint):
|
||||
"""Return True if the resolved endpoint should NOT receive noindex headers."""
|
||||
if not endpoint:
|
||||
return False
|
||||
if endpoint in _PUBLIC_INDEXABLE_ENDPOINTS:
|
||||
return True
|
||||
return endpoint.startswith(_PUBLIC_INDEXABLE_PREFIXES)
|
||||
|
||||
|
||||
@app.after_request
def add_no_crawl_headers(response):
    """
    Add HTTP headers to discourage search engine crawling and indexing.
    This provides defense-in-depth alongside robots.txt and meta tags.

    Marketing pages, legal pages, and the post-payment success page are
    exempted so they remain indexable by search engines and AI crawlers.
    """
    # Only non-public routes get the restrictive directive; exempt endpoints
    # pass through untouched so crawlers may index them.
    if not _is_public_indexable_endpoint(request.endpoint):
        response.headers['X-Robots-Tag'] = 'noindex, nofollow, noarchive, nosnippet, noimageindex'
    return response
|
||||
|
||||
|
||||
Reference in New Issue
Block a user