From b53256d5e1c3dba749d453ec27ffd2dac5ed8b40 Mon Sep 17 00:00:00 2001
From: sjkillen
Date: Wed, 20 May 2026 19:43:39 -0600
Subject: [PATCH] Add index crawler

---
 www/crawler.js | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++
 www/index.html |   1 +
 2 files changed, 160 insertions(+)
 create mode 100644 www/crawler.js

diff --git a/www/crawler.js b/www/crawler.js
new file mode 100644
index 0000000..144a787
--- /dev/null
+++ b/www/crawler.js
@@ -0,0 +1,159 @@
+// Crawls the website to discover pages using PROPFIND
+
+import { url as root_url } from "./editor.js";
+
+/** Symbol key under which a crawled directory stores the URLs of its files. */
+export const __files__ = Symbol("crawlerFiles");
+
+/**
+ * Resolves an href against the site root and strips trailing slashes so that
+ * "/dir" and "/dir/" compare equal.
+ * @param {string} href - Absolute or root-relative URL.
+ * @returns {string} Normalized URL, intended for comparison only.
+ */
+function comparableHref(href) {
+  return new URL(href, root_url).href.replace(/\/+$/, "");
+}
+
+/**
+ * Formats a URL for display as a path relative to the site root.
+ * @param {string} url - Absolute URL of a crawled resource.
+ * @returns {string} "/" followed by the part of the URL after the site root.
+ */
+function displayPath(url) {
+  const relative = url.startsWith(root_url) ? url.slice(root_url.length) : url;
+  return `/${relative}`;
+}
+
+/**
+ * Converts href nodes into absolute URLs, dropping the listed directory itself.
+ * @param {Iterable<Node>} nodes - Nodes whose text content is an href.
+ * @param {string} exclude - URL to leave out of the result.
+ * @returns {string[]} Absolute URLs in document order.
+ */
+function nodeTextToHrefs(nodes, exclude) {
+  // Compare normalized forms: a server may report "/dir/" for a request to
+  // "/dir", and a plain string match would then list a directory inside itself.
+  const excluded = comparableHref(exclude);
+  const result = [];
+  for (const node of nodes) {
+    const href = new URL(node.textContent, root_url).href;
+    if (comparableHref(href) !== excluded) {
+      result.push(href);
+    }
+  }
+  return result;
+}
+
+/**
+ * Lists the direct children of a collection with a Depth: 1 PROPFIND request.
+ * @param {string} url - URL of the collection to list.
+ * @returns {Promise<[string[], string[]]>} Sub-collection URLs and file URLs.
+ * @throws {Error} If the request fails or the response is not well-formed XML.
+ */
+async function crawl(url) {
+  const response = await fetch(url, {
+    method: "PROPFIND",
+    headers: { Depth: "1" },
+  });
+  if (!response.ok) {
+    throw new Error(`PROPFIND ${url} failed with status ${response.status}`);
+  }
+  const text = await response.text();
+  const doc = new DOMParser().parseFromString(text, "text/xml");
+  // DOMParser reports XML errors through a <parsererror> element, not a throw.
+  if (doc.querySelector("parsererror") !== null) {
+    throw new Error(`PROPFIND ${url} returned malformed XML`);
+  }
+  // Select only the href that is a direct child of each response; properties
+  // such as lockdiscovery carry their own nested href elements.
+  const collections = nodeTextToHrefs(
+    doc.querySelectorAll("response:has(resourcetype collection) > href"),
+    url,
+  );
+  const files = nodeTextToHrefs(
+    doc.querySelectorAll("response:not(:has(resourcetype collection)) > href"),
+    url,
+  );
+  return [collections, files];
+}
+
+/**
+ * Recursively crawls a collection and everything below it.
+ * @param {string} url - URL of the collection to crawl.
+ * @param {Set<string>} [visited] - Normalized URLs already crawled.
+ * @returns {Promise<object>} Sub-collection URLs mapped to their own crawl
+ *   results, plus the file URLs of this collection under `__files__`.
+ */
+async function crawlCollection(url, visited = new Set()) {
+  visited.add(comparableHref(url));
+  const result = {};
+  const [subcollections, subfiles] = await crawl(url);
+  // One request at a time, so a large tree does not trigger a burst of
+  // concurrent PROPFINDs.
+  for (const subcollection of subcollections) {
+    // Skip collections already seen so a looping listing cannot recurse forever.
+    if (visited.has(comparableHref(subcollection))) {
+      continue;
+    }
+    result[subcollection] = await crawlCollection(subcollection, visited);
+  }
+  result[__files__] = subfiles;
+  return result;
+}
+
+/**
+ * Crawls the tree rooted at `url`.
+ * @param {string} url - URL of the root collection.
+ * @returns {Promise<object>} Object with `url` as its only key, mapped to the
+ *   crawl result of that collection.
+ */
+async function recursiveCrawl(url) {
+  return {
+    [url]: await crawlCollection(url),
+  };
+}
+
+/**
+ * Builds a nested list for a crawled directory.
+ * @param {string} url - URL of the directory.
+ * @param {object} recursiveCrawlResult - Crawl result for that directory.
+ * @returns {HTMLUListElement} List with a link to the directory, then one item
+ *   per sub-directory (nested list) and one item per file (link).
+ */
+function formattedIndexDirectory(url, recursiveCrawlResult) {
+  const ul = document.createElement("ul");
+  const heading = document.createElement("a");
+  heading.href = url;
+  heading.textContent = displayPath(url);
+  ul.appendChild(heading);
+  // Object.keys skips the __files__ symbol, leaving only sub-collection URLs.
+  for (const subcollection of Object.keys(recursiveCrawlResult)) {
+    const li = document.createElement("li");
+    li.appendChild(
+      formattedIndexDirectory(
+        subcollection,
+        recursiveCrawlResult[subcollection],
+      ),
+    );
+    ul.appendChild(li);
+  }
+  for (const file of recursiveCrawlResult[__files__]) {
+    const li = document.createElement("li");
+    const a = document.createElement("a");
+    a.href = file;
+    a.textContent = displayPath(file);
+    li.appendChild(a);
+    ul.appendChild(li);
+  }
+  return ul;
+}
+
+try {
+  const rcr = await recursiveCrawl(root_url);
+  const [root] = Object.keys(rcr);
+  document.body.appendChild(formattedIndexDirectory(root, rcr[root]));
+} catch (err) {
+  // Keep the page usable when the server is unreachable or lacks PROPFIND.
+  console.error("Failed to build the site index", err);
+}
diff --git a/www/index.html b/www/index.html
index a6a38df..01234e3 100644
--- a/www/index.html
+++ b/www/index.html
@@ -5,6 +5,7 @@ +