Back to catalogue
Context · PreToolUse · WebFetch · PreToolUse — Before tool execution · can block · ⚡ blocking
WebFetch HTML-to-Markdown converter
Fetch any web page as clean Markdown — strip boilerplate HTML before it enters context
Intercepts WebFetch calls, fetches the URL with curl, detects HTML responses, and converts them to Markdown via pandoc (falling back to a built-in tag stripper). Injects the converted content directly, skipping the raw HTML that Claude would otherwise process. Non-HTML responses (JSON APIs, plain text) pass through unchanged. Truncates to 30 000 chars.
What does the WebFetch HTML-to-Markdown converter hook do?
WebFetch HTML-to-Markdown converter is a Claude Code PreToolUse hook matching WebFetch. It fires automatically at that lifecycle event — outside the model, so it can't be skipped or forgotten. It fetches any web page as clean Markdown, stripping boilerplate HTML before it enters context.
Use cases
- Reading documentation pages without loading CSS/JS boilerplate
- Extracting article content from blog posts
- Reducing token usage during web research sessions
Tags
#context-optimization #html #markdown #tokens #webfetch #productivity
settings.json fragment
{
"hooks": {
"PreToolUse": [
{
"hooks": [
{
"command": "node $CLAUDE_PROJECT_DIR/.claude/hooks/pre-webfetch-html-to-markdown.mjs",
"type": "command"
}
],
"matcher": "WebFetch"
}
]
}
}
Script · .claude/hooks/pre-webfetch-html-to-markdown.mjs
#!/usr/bin/env node
// Converts HTML pages to Markdown before WebFetch processing (PreToolUse WebFetch)
import { readFileSync } from 'fs';
import { execSync } from 'child_process';
import { fileURLToPath } from 'url';
// Upper bound on the length (string .length, i.e. UTF-16 code units) of the
// converted content that run() embeds in its result; longer content is cut.
const MAX_CHARS = 30_000;
/**
 * Download a URL with curl and return the response body as trimmed text.
 *
 * The URL travels to the child process through the HOOK_URL environment
 * variable instead of being interpolated into the command string, so the
 * shell only ever sees a quoted variable expansion.
 *
 * @param {string} url - Address to fetch.
 * @returns {string} Response body with leading/trailing whitespace removed.
 * @throws {Error} Whatever execSync throws (non-zero curl exit, timeout, ...).
 */
function defaultFetchUrl(url) {
  const curlCommand = 'curl -sL --max-time 10 --user-agent "Mozilla/5.0" "$HOOK_URL"';
  const childEnv = { ...process.env, HOOK_URL: url };
  const body = execSync(curlCommand, {
    env: childEnv,
    timeout: 15_000,
    encoding: 'utf8',
  });
  return body.trim();
}
/**
 * Convert an HTML document to Markdown by piping it through pandoc.
 *
 * @param {string} html - HTML source, written to pandoc's stdin.
 * @returns {string} pandoc's Markdown output, trimmed.
 * @throws {Error} Whatever execSync throws (pandoc missing, non-zero exit, timeout).
 */
function defaultConvertHtml(html) {
  const pandocOptions = { timeout: 10_000, encoding: 'utf8', input: html };
  const converted = execSync('pandoc -f html -t markdown --wrap=none', pandocOptions);
  return converted.trim();
}
/**
 * Report whether `which pandoc` succeeds.
 *
 * @returns {boolean} true when the lookup command exits successfully,
 *   false when execSync throws for any reason.
 */
function defaultHasPandoc() {
  let found = true;
  try {
    execSync('which pandoc', { timeout: 2_000, encoding: 'utf8' });
  } catch {
    found = false;
  }
  return found;
}
/**
 * Minimal HTML-to-text fallback used when pandoc is unavailable or fails.
 *
 * Steps, in order: drop <script> and <style> elements with their contents,
 * replace every remaining tag with a space, decode a small set of common
 * character entities, then collapse runs of spaces/tabs and blank lines.
 *
 * @param {string} html - Raw HTML source.
 * @returns {string} Plain text with tags removed and basic entities decoded.
 */
function stripHtml(html) {
  // Entity name (without the surrounding "&" and ";") -> literal character.
  const entities = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    '#39': "'",
    nbsp: ' ',
  };

  return html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    // Decode in a single pass so an escaped entity such as "&amp;lt;" becomes
    // the text "&lt;" rather than being decoded twice into "<".
    .replace(/&(amp|lt|gt|quot|apos|#39|nbsp);/g, (_match, name) => entities[name])
    .replace(/[ \t]{2,}/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * PreToolUse handler for WebFetch: fetch the requested URL, and when the
 * response looks like an HTML document, convert it to Markdown and return it
 * inside a blocking decision.
 *
 * Every "not applicable" or failure path returns null, which the CLI entry
 * point treats as "print nothing".
 *
 * @param {object|null|undefined} input - Hook payload; `tool_name` and
 *   `tool_input.url` are the only fields read.
 * @param {object} [deps] - Injectable collaborators (useful for tests).
 * @param {(url: string) => string} [deps.fetchUrl] - Returns the response body.
 * @param {(html: string) => string} [deps.convertHtml] - HTML -> Markdown.
 * @param {() => boolean} [deps.hasPandoc] - Whether convertHtml should be tried.
 * @returns {{decision: 'block', reason: string} | null} The converted content
 *   wrapped in a block decision, or null when the hook does not apply.
 */
export function run(input, {
  fetchUrl = defaultFetchUrl,
  convertHtml = defaultConvertHtml,
  hasPandoc = defaultHasPandoc,
} = {}) {
  // A missing payload is treated like any other non-WebFetch call.
  if (input?.tool_name !== 'WebFetch') return null;

  const url = input.tool_input?.url;
  if (typeof url !== 'string' || !/^https?:\/\//i.test(url)) return null;

  let html;
  try {
    html = fetchUrl(url);
  } catch {
    // Fetch failure: stay out of the way.
    return null;
  }
  // Guard against collaborators that return something other than text.
  if (typeof html !== 'string' || !html.trim()) return null;

  // HTML pages only — let JSON, binaries, etc. pass through untouched.
  if (!/<html|<!doctype\s+html/i.test(html.slice(0, 2000))) return null;

  let markdown;
  try {
    markdown = hasPandoc() ? convertHtml(html) : stripHtml(html);
  } catch {
    // The primary conversion failed; retry with the built-in tag stripper.
    try {
      markdown = stripHtml(html);
    } catch {
      return null;
    }
  }
  if (typeof markdown !== 'string' || !markdown.trim()) return null;

  let content = markdown.trim();
  const truncated = content.length > MAX_CHARS;
  if (truncated) {
    content = content.slice(0, MAX_CHARS);
    // slice() counts UTF-16 code units; if the cut landed between the two
    // halves of a surrogate pair, drop the dangling high surrogate.
    const lastUnit = content.charCodeAt(content.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      content = content.slice(0, -1);
    }
  }

  let domain;
  try {
    domain = new URL(url).hostname;
  } catch {
    domain = url;
  }

  const suffix = truncated ? ` (truncated to ${MAX_CHARS} chars)` : '';
  return {
    decision: 'block',
    reason: `[webfetch-html-to-markdown] \`${domain}\` converted to Markdown${suffix}:\n\n${content}`,
  };
}
// CLI entry point: runs only when this file is the script passed to node
// (process.argv[1]), not when it is imported as a module. Reads the hook
// payload as JSON from stdin (fd 0), passes it to run(), and writes the
// result to stdout as JSON; a null result prints nothing.
/* v8 ignore next 5 */
if (process.argv[1] === fileURLToPath(import.meta.url)) {
const input = JSON.parse(readFileSync(0, 'utf8'));
const result = run(input);
if (result) process.stdout.write(JSON.stringify(result));
}