Returns 522 all the time

I have a script that crawls all links on the page and populates each link’s title attribute with the <title> tag content of the linked page. Although it seemed to run fine when I tested it (I had to comment out the lines where I work with KV, though), after deployment all I get is a 522 after some time. I have an A DNS record for my subdomain which points to 192.0.2.1 with proxy status “Proxied”, and I also have a CNAME record for www which points to the subdomain… I set up the router, created the KV namespace, and added a KV namespace binding for it…

    /**
 * Main worker entry point
 */
addEventListener("fetch", event => {
  // Fail-safe in case of an unhandled exception
  console.log('hey');
  event.passThroughOnException();
  
  const url = new URL(event.request.url);
  if (event.request.method === 'GET') {
    event.respondWith(handleRequest(event.request, event));
  }
});

// Workers can only decode utf-8, so keep a list of character encodings that
// can safely be decoded. Responses declaring any other charset are passed
// through unmodified (checked in processHtmlResponse and
// chunkContainsInvalidCharset).
const VALID_CHARSETS = ['utf-8', 'utf8', 'iso-8859-1', 'us-ascii'];

/**
 * Handle all non-proxied GET requests.
 *
 * Serves from the edge cache when possible; otherwise fetches from the
 * origin. Successful HTML responses are sent through the title-rewriting
 * pipeline and cached; everything else passes through unmodified.
 *
 * @param {*} request - Original request
 * @param {*} event - Original worker event
 * @returns {Promise<Response>} The response to send to the client
 */
async function handleRequest(request, event) {
  const cache = caches.default;

  // Serve a previously cached copy when one exists.
  const cached = await cache.match(event.request);
  if (cached) {
    return cached;
  }

  const originResponse = await fetch(request);
  if (!originResponse || originResponse.status !== 200) {
    return originResponse;
  }

  const contentType = originResponse.headers.get("content-type");
  const isHtml = contentType !== null && contentType.indexOf("text/html") !== -1;
  if (!isHtml) {
    return originResponse;
  }

  // Rewrite the HTML, cache a clone in the background, and stream it out.
  const rewritten = await processHtmlResponse(originResponse, event.request, event);
  event.waitUntil(cache.put(event.request, rewritten.clone()));
  return rewritten;
}

/**
 * Handle all of the processing for a (likely) HTML request.
 * - Inspect the origin response's declared charset; pass through anything
 *   the worker cannot decode (see VALID_CHARSETS).
 * - Otherwise set up a streaming identity pipe and hand the body to
 *   modifyHtmlStream for rewriting.
 *
 * Extra care needs to be taken to make sure the character encoding from the
 * original HTML is extracted and converted to utf-8 and that the downstream
 * response is identified as utf-8.
 *
 * @param {*} response The original response
 * @param {*} request The original request
 * @param {*} event worker event object
 * @returns {Promise<Response>} Streaming rewritten response, or the
 *     original response when its charset is unsupported.
 */
async function processHtmlResponse(response, request, event) {
  // Workers can only decode utf-8. If it is anything else, pass the
  // response through unmodified.
  const contentType = response.headers.get("content-type");
  const charsetRegex = /charset\s*=\s*([^\s;]+)/mgi;
  const match = charsetRegex.exec(contentType);
  if (match !== null) {
    const charset = match[1].toLowerCase();
    if (!VALID_CHARSETS.includes(charset)) {
      return response;
    }
  }

  // Create an identity TransformStream (a.k.a. a pipe).
  // The readable side will become our new response body.
  const { readable, writable } = new TransformStream();

  // Create a cloned response with our modified stream
  const newResponse = new Response(readable, response);

  // Keep the worker alive until the async rewrite completes. Without
  // waitUntil the promise floats and the runtime may tear the worker down
  // before the output stream is fully written, truncating the response.
  event.waitUntil(modifyHtmlStream(response.body, writable, request, event));

  // Return the in-process response so it can be streamed.
  return newResponse;
}

/**
 * Check whether an HTML chunk declares, via a meta tag, a character set the
 * worker cannot decode (anything outside VALID_CHARSETS).
 *
 * @param {string} chunk - Chunk of HTML to scan
 * @returns {boolean} true if the chunk declares an unsupported charset
 */
function chunkContainsInvalidCharset(chunk) {
  // <meta charset="..."> form
  const charsetTag = /<\s*meta[^>]+charset\s*=\s*['"]([^'"]*)['"][^>]*>/mgi.exec(chunk);
  if (charsetTag && !VALID_CHARSETS.includes(charsetTag[1].toLowerCase())) {
    return true;
  }

  // <meta http-equiv="content-type" content="...; charset=..."> form
  const httpEquivTag = /<\s*meta[^>]+http-equiv\s*=\s*['"]\s*content-type[^>]*>/mgi.exec(chunk);
  if (httpEquivTag) {
    const declared = /charset\s*=\s*([^\s"]*)/mgi.exec(httpEquivTag[0]);
    if (declared && !VALID_CHARSETS.includes(declared[1].toLowerCase())) {
      return true;
    }
  }

  return false;
}

/**
 * Process the streaming HTML response from the origin server.
 *
 * Reads the origin body chunk-by-chunk, decodes it as utf-8, rewrites each
 * decoded chunk with modifyHtmlChunk() and writes the result to the output
 * stream. If decoding fails, or the first chunk declares an unsupported
 * charset in a meta tag, the remainder of the body is passed through as raw
 * bytes ("unsupportedCharset" mode).
 *
 * @param {*} readable - Input stream (from the origin).
 * @param {*} writable - Output stream (to the browser).
 * @param {*} request - Original request object for downstream use.
 * @param {*} event - Worker event object
 */
async function modifyHtmlStream(readable, writable, request, event) {
  const reader = readable.getReader();
  const writer = writable.getWriter();
  const encoder = new TextEncoder();
  // fatal:true makes decode() throw on invalid utf-8, which is how we detect
  // non-utf-8 bodies and switch to raw passthrough below.
  let decoder = new TextDecoder("utf-8", {fatal: true});

  let firstChunk = true;
  let unsupportedCharset = false;

  // "partial" carries a possibly-unclosed <a ...> tag carved off the end of
  // one chunk so it can be completed when the next chunk arrives.
  let partial = '';
  let content = '';

  try {
    for(;;) {
      const { done, value } = await reader.read();
      if (done) {
        // End of stream: flush any carried-over partial tag.
        if (partial.length) {
          partial = await modifyHtmlChunk(partial, request, event);
          await writer.write(encoder.encode(partial));
          partial = '';
        }
        break;
      }

      let chunk = null;
      if (unsupportedCharset) {
        // Pass the raw bytes straight through
        await writer.write(value);
        continue;
      } else {
        try {
          chunk = decoder.decode(value, {stream:true});
        } catch (e) {
          // Decoding failed, switch to passthrough
          unsupportedCharset = true;
          // Flush any decoded-but-unwritten carry before the raw bytes.
          if (partial.length) {
            await writer.write(encoder.encode(partial));
            partial = '';
          }
          await writer.write(value);
          continue;
        }
      }

      try {
        // Look inside of the first chunk for a HTML charset or content-type meta tag.
        if (firstChunk) {
          firstChunk = false;
          if (chunkContainsInvalidCharset(chunk)) {
            // switch to passthrough
            unsupportedCharset = true;
            if (partial.length) {
              await writer.write(encoder.encode(partial));
              partial = '';
            }
            await writer.write(value);
            continue;
          }
        }

        // TODO: Optimize this so we aren't continuously adding strings together
        content = partial + chunk;
        partial = '';

        // See if there is an unclosed a tag at the end (and if so, carve it out
        // to complete when the remainder comes in).
        // This isn't perfect (case sensitive and doesn't allow whitespace in the tag)
        // but it is good enough for our purpose and much faster than a regex.
        // NOTE(review): ordinary <a ...> tags end with '>' rather than '/>',
        // so this lookup nearly always misses and the tail from the last '<a'
        // is deferred to the final flush — confirm '/>' is intentional.
        const linkPos = content.lastIndexOf('<a');
        if (linkPos >= 0) {
          const linkClose = content.indexOf('/>', linkPos);
          if (linkClose === -1) {
            partial = content.slice(linkPos);
            content = content.slice(0, linkPos);
          }
        }

        if (content.length) {
          content = await modifyHtmlChunk(content, request, event);
        }
      } catch (e) {
        // Ignore the exception — fall through and emit whatever content we have.
      }
      if (content.length) {
        await writer.write(encoder.encode(content));
        content = '';
      }
    }
  } catch(e) {
    // Ignore the exception — fall through to close the writer.
  }

  try {
    await writer.close();
  } catch(e) {
    // Ignore the exception (the writer may already be closed or errored).
  }
}

/**
 * Identify all <a> tags in the chunk and add a "title" attribute to them,
 * populated with the target page's <title> text (looked up via fetchTitle).
 * Stops at the first link for which no title is available.
 *
 * @param {string} content - Text chunk from the streaming HTML (or accumulated head)
 * @param {*} request - Original request object for downstream use.
 * @param {*} event - Worker event object
 * @returns {Promise<string>} The chunk with title attributes inserted
 */
async function modifyHtmlChunk(content, request, event) {
  // Fully tokenizing and parsing the HTML is expensive.  This regex is much faster and should be reasonably safe.
  // It looks for links and extracts the URL as match #1.  It shouldn't match
  // in-text content because the < > brackets would be escaped in the HTML.
  const linkRegex = /<a\s+[^>]*href\s*=\s*['"]((https?:\/\/)?[^'"]+)[^>]*>/mgi;
  let match = linkRegex.exec(content);
  while (match !== null) {
    const matchString = match[0];
    const title = await fetchTitle(match[1], request, event);
    // fetchTitle can return undefined on a fetch failure; treat that the
    // same as an empty title instead of crashing on title.length.
    if (!title || !title.length) {
      break;
    }
    // Escape the title so quotes/angle brackets cannot terminate the
    // attribute value early (broken markup / attribute injection).
    const safeTitle = title
      .replace(/&/g, '&amp;')
      .replace(/"/g, '&quot;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
    const newString = matchString.replace('>', ' title="' + safeTitle + '">');
    content = content.split(matchString).join(newString);
    // Account for the inserted attribute so the regex resumes after this tag.
    linkRegex.lastIndex -= matchString.length - newString.length;
    match = linkRegex.exec(content);
  }

  return content;
}

/**
 * Fetch the <title> text of the given page, caching results for 7 days in
 * the TITLES KV namespace.
 *
 * @param {string} url - URL to fetch the title from (root-relative URLs are
 *     resolved against the original request's origin)
 * @param {*} request - Original request for the page HTML
 * @returns {Promise<string>} The page title, or "" when it cannot be
 *     determined (non-HTML page, no <title> tag, or fetch failure)
 */
async function fetchTitle(url, request) {
  // Resolve root-relative links against the original request's origin.
  const requestUrl = new URL(request.url);
  if (url.startsWith('/')) {
    url = requestUrl.protocol + '//' + requestUrl.hostname + url;
  }

  // Serve from KV when we already know the title.
  let title = await TITLES.get(url);
  if (title !== null && title !== "") {
    return title;
  }

  title = "";

  // Forward the client identity so the origin sees a normal page view.
  const userAgent = request.headers.get('user-agent');
  const clientAddr = request.headers.get('cf-connecting-ip');

  const headers = { 'Referer': request.url };
  if (userAgent) {
    headers['User-Agent'] = userAgent;
  } else {
    headers['User-Agent'] = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";
  }
  if (clientAddr) {
    headers['X-Forwarded-For'] = clientAddr;
  }

  try {
    const response = await fetch(url, {headers: headers});
    const contentType = response.headers.get("content-type");

    if (response && response.status === 200 && contentType && contentType.indexOf("text/html") !== -1) {
      const text = await response.text();
      const titleRegex = /<title>([^<]*)<\/title>/mi;
      const match = titleRegex.exec(text);
      // Guard: the page may have no <title> tag at all.
      if (match) {
        title = match[1].trim();
      }
    }
  } catch(e) {
    // Fetch failed — return "" (not undefined) so callers can safely test
    // title.length.
    return title;
  }

  // Cache only successful lookups; expire each title in 7 days automatically.
  if (title !== "") {
    await TITLES.put(url, title, {expirationTtl: 60 * 60 * 24 * 7});
  }
  return title;
}
1 Like

Just encountered this. You’ve got an error in your code. Mine was that I wasn’t using the event.respondWith(). Not sure why yours may not be working, but it was definitely not a CF issue in my case.

1 Like

Thanks for your response. Actually, my script has to make calls to the website pages, so this thing with IP address 192.0.2.1 won’t work in my case. Unfortunately, if I mention my server’s actual IP address and try to load the page, I get ERR_SSL_VERSION_OR_CIPHER_MISMATCH.