I have a script that crawls all the links on a page and populates each link's title attribute with the <title> tag content from the linked pages. It seemed to run fine when I tested it (though I had to comment out the lines where I work with KV), but after deployment all I get is a 522 after some time. I have an A record for my subdomain pointing to 192.0.2.1 with proxy status “Proxied”, and a CNAME record for www pointing to the subdomain. I set up the route, created the KV namespace, and added a KV namespace binding for it.
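For what it's worth, this is the minimal throwaway worker I'd deploy to sanity-check the KV binding on its own (it assumes the binding is named TITLES, the same name the script below uses):

addEventListener("fetch", event => {
event.respondWith(checkKv());
});

async function checkKv() {
// Round-trip a value through the TITLES binding
await TITLES.put("kv-smoke-test", "ok", {expirationTtl: 60});
const value = await TITLES.get("kv-smoke-test");
return new Response("KV says: " + value);
}

Here is the full script: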
/**
* Main worker entry point
*/
addEventListener("fetch", event => {
// Fail-safe in case of an unhandled exception
console.log('hey');
event.passThroughOnException();
const url = new URL(event.request.url);
if (event.request.method === 'GET') {
event.respondWith(handleRequest(event.request, event));
}
});
// Workers can only decode utf-8 so keep a list of character encodings that can be decoded.
const VALID_CHARSETS = ['utf-8', 'utf8', 'iso-8859-1', 'us-ascii'];
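// For reference, a fatal decoder throws a TypeError on bytes that aren't valid
// utf-8; modifyHtmlStream below relies on this to fall back to passthrough:
// new TextDecoder('utf-8', {fatal: true}).decode(new Uint8Array([0xff])); // throws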
/**
* Handle all non-proxied requests. Send HTML for further processing
* and pass everything else through unmodified.
* @param {*} request - Original request
* @param {*} event - Original worker event
*/
async function handleRequest(request, event) {
// Try to get cache
let cache = caches.default;
let response = await cache.match(event.request);
if (response) {
return response;
}
response = await fetch(request);
if (response && response.status === 200) {
const contentType = response.headers.get("content-type");
if (contentType && contentType.indexOf("text/html") !== -1) {
response = await processHtmlResponse(response, event.request, event);
event.waitUntil(cache.put(event.request, response.clone()));
return response;
}
}
return response;
}
/**
* Handle all of the processing for a (likely) HTML request.
* - Pass through the request to the origin and inspect the response.
* - If the response is HTML, set up a streaming transform and pass it on to modifyHtmlStream for processing.
*
* Extra care needs to be taken to make sure the character encoding from the original
* HTML is extracted and converted to utf-8 and that the downstream response is identified
* as utf-8.
*
* @param {*} response The original response
* @param {*} request The original request
* @param {*} event worker event object
*/
async function processHtmlResponse(response, request, event) {
// Workers can only decode utf-8. If it is anything else, pass the
// response through unmodified
const contentType = response.headers.get("content-type");
const charsetRegex = /charset\s*=\s*([^\s;]+)/mgi;
const match = charsetRegex.exec(contentType);
if (match !== null) {
let charset = match[1].toLowerCase();
if (!VALID_CHARSETS.includes(charset)) {
return response;
}
}
// Create an identity TransformStream (a.k.a. a pipe).
// The readable side will become our new response body.
const { readable, writable } = new TransformStream();
// Create a cloned response with our modified stream
const newResponse = new Response(readable, response);
// console.log('modify the stream');
// Start the async processing of the response stream
modifyHtmlStream(response.body, writable, request, event);
// Return the in-process response so it can be streamed.
return newResponse;
}
/**
* Check to see if the HTML chunk includes a meta tag for an unsupported charset
* @param {*} chunk - Chunk of HTML to scan
* @returns {bool} - true if the HTML chunk includes a meta tag for an unsupported charset
*/
function chunkContainsInvalidCharset(chunk) {
let invalid = false;
// meta charset
const charsetRegex = /<\s*meta[^>]+charset\s*=\s*['"]([^'"]*)['"][^>]*>/mgi;
const charsetMatch = charsetRegex.exec(chunk);
if (charsetMatch) {
const docCharset = charsetMatch[1].toLowerCase();
if (!VALID_CHARSETS.includes(docCharset)) {
invalid = true;
}
}
// content-type
const contentTypeRegex = /<\s*meta[^>]+http-equiv\s*=\s*['"]\s*content-type[^>]*>/mgi;
const contentTypeMatch = contentTypeRegex.exec(chunk);
if (contentTypeMatch) {
const metaTag = contentTypeMatch[0];
const metaRegex = /charset\s*=\s*([^\s"]*)/mgi;
const metaMatch = metaRegex.exec(metaTag);
if (metaMatch) {
const charset = metaMatch[1].toLowerCase();
if (!VALID_CHARSETS.includes(charset)) {
invalid = true;
}
}
}
return invalid;
}
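// For example (utf-8 is in VALID_CHARSETS, shift_jis is not):
// chunkContainsInvalidCharset('<meta charset="utf-8">') === false
// chunkContainsInvalidCharset('<meta charset="shift_jis">') === true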
/**
* Process the streaming HTML response from the origin server.
*
* @param {*} readable - Input stream (from the origin).
* @param {*} writable - Output stream (to the browser).
* @param {*} request - Original request object for downstream use.
* @param {*} event - Worker event object
*/
async function modifyHtmlStream(readable, writable, request, event) {
const reader = readable.getReader();
const writer = writable.getWriter();
const encoder = new TextEncoder();
let decoder = new TextDecoder("utf-8", {fatal: true});
let firstChunk = true;
let unsupportedCharset = false;
let partial = '';
let content = '';
try {
for(;;) {
const { done, value } = await reader.read();
if (done) {
if (partial.length) {
// console.log('modify html chunk')
partial = await modifyHtmlChunk(partial, request, event);
await writer.write(encoder.encode(partial));
partial = '';
}
break;
}
let chunk = null;
if (unsupportedCharset) {
// console.log('unsupported charset')
// Pass the data straight through
await writer.write(value);
continue;
} else {
try {
chunk = decoder.decode(value, {stream:true});
} catch (e) {
// Decoding failed, switch to passthrough
unsupportedCharset = true;
if (partial.length) {
await writer.write(encoder.encode(partial));
partial = '';
}
await writer.write(value);
continue;
}
}
try {
// Look inside of the first chunk for an HTML charset or content-type meta tag.
if (firstChunk) {
firstChunk = false;
if (chunkContainsInvalidCharset(chunk)) {
// switch to passthrough
unsupportedCharset = true;
if (partial.length) {
await writer.write(encoder.encode(partial));
partial = '';
}
await writer.write(value);
continue;
}
}
// TODO: Optimize this so we aren't continuously adding strings together
content = partial + chunk;
partial = '';
// See if there is an unclosed <a> tag at the end (and if so, carve it out
// to complete when the remainder comes in).
// This isn't perfect (it is case-sensitive and doesn't allow whitespace in the
// tag) but it is good enough for our purpose and much faster than a regex.
const linkPos = content.lastIndexOf('<a');
if (linkPos >= 0) {
// An opening <a ...> tag ends with '>'; if none follows, the tag is
// still incomplete and the rest of it will arrive in the next chunk
const linkClose = content.indexOf('>', linkPos);
if (linkClose === -1) {
partial = content.slice(linkPos);
content = content.slice(0, linkPos);
}
}
if (content.length) {
content = await modifyHtmlChunk(content, request, event);
}
} catch (e) {
// Ignore the exception
}
if (content.length) {
await writer.write(encoder.encode(content));
content = '';
}
}
} catch(e) {
// Ignore the exception
}
try {
await writer.close();
} catch(e) {
// Ignore the exception
}
}
/**
* Identify all <a> tags and add the "title" attribute to them
*
* @param {*} content - Text chunk from the streaming HTML (or the accumulated partial)
* @param {*} request - Original request object for downstream use.
* @param {*} event - Worker event object
*/
async function modifyHtmlChunk(content, request, event) {
// Fully tokenizing and parsing the HTML is expensive. This regex is much faster and should be reasonably safe.
// It looks for links and extracts the URL as match #1. It shouldn't match
// in-text content because the < > brackets would be escaped in the HTML.
const linkRegex = /<a\s+[^>]*href\s*=\s*['"]((https?:\/\/)?[^'"]+)[^>]*>/mgi;
let match = linkRegex.exec(content);
while (match !== null) {
// console.log('found a link');
const matchString = match[0];
const title = await fetchTitle(match[1], request, event);
// console.log(title);
if (title.length) {
// Escape double quotes so the title text can't break out of the attribute
const safeTitle = title.replace(/"/g, '&quot;');
// append the title attribute to the opening tag
const newString = matchString.replace('>', ' title="' + safeTitle + '">');
content = content.split(matchString).join(newString);
// Account for the inserted text so the regex keeps scanning from
// the correct position
linkRegex.lastIndex += newString.length - matchString.length;
}
// If no title could be fetched, just move on to the next link
match = linkRegex.exec(content);
}
return content;
}
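// For example, a chunk containing <a href="/about">About</a> should come back as
// <a href="/about" title="About Us">About</a>, where "About Us" stands in for
// whatever the linked page's own <title> tag actually contains.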
/**
* Fetch the title text from a linked page's <title> tag
*
* @param {*} url - URL to fetch title from
* @param {*} request - Original request for the page HTML
* @param {*} event - Worker event object
*/
async function fetchTitle(url, request, event) {
const requestUrl = new URL(request.url);
// Resolve root-relative URLs against the page currently being processed
if (url.startsWith('/')) {
url = requestUrl.protocol + '//' + requestUrl.hostname + url;
}
// console.log(url);
let title = await TITLES.get(url);
if (title !== null && title !== "") {
console.log('got title from cache');
return title;
}
title = "";
const userAgent = request.headers.get('user-agent');
const clientAddr = request.headers.get('cf-connecting-ip');
let headers = {'Referer': request.url};
if (userAgent) {
headers['User-Agent'] = userAgent;
} else {
headers['User-Agent'] = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";
}
if (clientAddr) {
headers['X-Forwarded-For'] = clientAddr;
}
try {
const response = await fetch(url, {headers: headers});
const contentType = response.headers.get("content-type");
// console.log(response.status);
if (response && response.status === 200 && contentType && contentType.indexOf("text/html") !== -1) {
// console.log('response is successful')
const text = await response.text();
// Allow attributes on the <title> tag and guard against pages without one
const titleRegex = /<title[^>]*>([^<]*)<\/title>/mi;
// Get the title
const match = titleRegex.exec(text);
if (match) {
title = match[1].trim();
}
} else {
// console.log('something went wrong');
}
} catch(e) {
// Fetch failed; return an empty string so the caller can skip this link
// console.log('couldnt fetch the title')
return "";
}
// Expire each title in 7 days automatically
await TITLES.put(url, title, {expirationTtl: 60 * 60 * 24 * 7});
return title;
}