Add index crawler

This commit is contained in:
Spencer Killen 2026-05-20 19:43:39 -06:00
parent 3e655d56cf
commit b53256d5e1
Signed by: sjkillen
GPG key ID: 1DAA9D8D7C6ADD05
2 changed files with 85 additions and 0 deletions

84
www/crawler.js Normal file
View file

@ -0,0 +1,84 @@
// Crawls the website to discover pages using PROPFIND
import { url as root_url } from "./editor.js";
// Symbol key under which each crawled collection node stores its list of file
// URLs. Subcollections are stored under string (URL) keys on the same object,
// so a Symbol keeps the file list out of Object.getOwnPropertyNames() results.
export const __files__ = Symbol("crawlerFiles");
/**
 * Resolves the text content of each node to an absolute URL (relative to
 * `root_url`) and drops every entry that refers to `exclude`.
 *
 * @param {Iterable<{textContent: string}>} nodes - Nodes whose text is an href.
 * @param {string} [exclude] - URL to leave out of the result. It is resolved
 *   against `root_url` before comparing, so equivalent spellings such as
 *   "http://host" and "http://host/" are treated as the same URL. When
 *   null/undefined nothing is excluded.
 * @returns {string[]} Absolute hrefs, in the order the nodes were given.
 */
function nodeTextToHrefs(nodes, exclude) {
  // Normalize `exclude` exactly like the candidates; comparing a raw string
  // against normalized hrefs would miss matches that differ only in spelling.
  const excludedHref =
    exclude == null ? undefined : new URL(exclude, root_url).href;
  const hrefs = [];
  for (const node of nodes) {
    const href = new URL(node.textContent, root_url).href;
    if (href !== excludedHref) {
      hrefs.push(href);
    }
  }
  return hrefs;
}
/**
 * Issues a PROPFIND request with `Depth: 1` against `url`, parses the reply
 * as XML and splits the listed hrefs into collections and files.
 *
 * An href counts as a collection when its enclosing `response` element has a
 * `collection` element inside a `resourcetype` element. Hrefs equal to `url`
 * itself are filtered out by nodeTextToHrefs.
 *
 * @param {string} url - URL of the collection to list.
 * @returns {Promise<[string[], string[]]>} `[collections, files]`.
 */
async function crawl(url) {
  const propfindOptions = { method: "PROPFIND", headers: { Depth: "1" } };
  const reply = await fetch(url, propfindOptions);
  const body = await reply.text();
  const xml = new DOMParser().parseFromString(body, "text/xml");

  // Both result lists are built the same way; only the selector differs.
  const hrefsMatching = (selector) =>
    nodeTextToHrefs(xml.querySelectorAll(selector), url);

  return [
    hrefsMatching("response:has(resourcetype collection) href"),
    hrefsMatching("response:not(:has(resourcetype collection)) href"),
  ];
}
/**
 * Recursively crawls the collection at `url` and everything beneath it.
 *
 * @param {string} url - URL of the collection to crawl.
 * @returns {Promise<object>} An object with one string key per subcollection
 *   URL (whose value is that subcollection's own crawl result) and the list
 *   of file URLs stored under the `__files__` symbol.
 */
async function crawlCollection(url) {
  const [subcollections, subfiles] = await crawl(url);
  // Sibling subtrees do not depend on each other, so crawl them concurrently
  // instead of waiting for each one to finish before starting the next.
  const subtrees = await Promise.all(
    subcollections.map((subcollection) => crawlCollection(subcollection)),
  );
  const result = {};
  // Promise.all keeps input order, so keys are inserted in the same order in
  // which crawl() listed the subcollections.
  subcollections.forEach((subcollection, index) => {
    result[subcollection] = subtrees[index];
  });
  result[__files__] = subfiles;
  return result;
}
/**
 * Crawls the collection at `url` and wraps the result in a single-entry
 * object keyed by `url`, giving the root the same shape as a subcollection
 * entry.
 *
 * @param {string} url - URL of the root collection.
 * @returns {Promise<object>} `{ [url]: <crawlCollection result> }`.
 */
async function recursiveCrawl(url) {
  const subtree = await crawlCollection(url);
  return Object.fromEntries([[url, subtree]]);
}
/**
 * Builds a nested <ul> describing one crawled collection.
 *
 * The list starts with a link to the collection itself, followed by one <li>
 * per subcollection (containing that subcollection's own nested list) and
 * then one <li> per file (containing a link to the file).
 *
 * @param {string} url - URL of the collection being rendered.
 * @param {object} recursiveCrawlResult - Crawl result for that collection:
 *   string keys are subcollection URLs, `__files__` holds the file URLs.
 * @returns {HTMLUListElement} The detached list element.
 */
function formattedIndexDirectory(url, recursiveCrawlResult) {
  const childUrls = Object.getOwnPropertyNames(recursiveCrawlResult);

  // Link whose label is the target with the first occurrence of root_url
  // removed and a "/" put in front.
  const linkTo = (target) => {
    const anchor = document.createElement("a");
    anchor.href = target;
    anchor.innerText = `/${target.replace(root_url, "")}`;
    return anchor;
  };

  // Wraps a single element in a fresh <li>.
  const asListItem = (child) => {
    const item = document.createElement("li");
    item.appendChild(child);
    return item;
  };

  const list = document.createElement("ul");
  list.appendChild(linkTo(url));
  for (const childUrl of childUrls) {
    const nested = formattedIndexDirectory(
      childUrl,
      recursiveCrawlResult[childUrl],
    );
    list.appendChild(asListItem(nested));
  }
  for (const fileUrl of recursiveCrawlResult[__files__]) {
    list.appendChild(asListItem(linkTo(fileUrl)));
  }
  return list;
}
// Module entry point: crawl the site starting at root_url, render the result
// as a nested list and append it to the page body.
const crawlTree = await recursiveCrawl(root_url);
// recursiveCrawl returns an object with a single key: the root URL.
const [rootKey] = Object.getOwnPropertyNames(crawlTree);
document.body.appendChild(
  formattedIndexDirectory(rootKey, crawlTree[rootKey]),
);

View file

@ -5,6 +5,7 @@
<link rel="stylesheet" href="style.css" /> <link rel="stylesheet" href="style.css" />
<script type="module" src="editor.js" defer></script> <script type="module" src="editor.js" defer></script>
<script type="module" src="tictactoe.js" defer></script> <script type="module" src="tictactoe.js" defer></script>
<script type="module" src="crawler.js" defer></script>
<link rel="stylesheet" href="tictactoe.css" /> <link rel="stylesheet" href="tictactoe.css" />
</head> </head>
<div id="banner"> <div id="banner">