Add index crawler
This commit is contained in:
parent
3e655d56cf
commit
b53256d5e1
2 changed files with 85 additions and 0 deletions
84
www/crawler.js
Normal file
84
www/crawler.js
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
// Crawls the website to discover pages using PROPFIND
|
||||
|
||||
import { url as root_url } from "./editor.js";
|
||||
|
||||
// Symbol key under which a crawled collection's tree node stores the array of
// file URLs found directly inside it. Subcollections are stored under their
// href strings, so a symbol key cannot collide with any of them.
export const __files__ = Symbol("crawlerFiles");
|
||||
|
||||
/**
 * Resolves the text content of each node to an absolute URL (relative
 * references are resolved against `root_url`) and returns the resulting hrefs,
 * leaving out any that refer to `exclude`.
 *
 * `crawl` passes the URL it queried as `exclude` so that the collection's own
 * entry is dropped from its listing. `exclude` is normalised the same way as
 * the node hrefs and the comparison ignores a trailing slash, so a queried URL
 * such as "http://host" or "http://host/dir" still matches the
 * "http://host/" / "http://host/dir/" form found in the listing.
 *
 * @param {Iterable<{textContent: string}>} nodes - Nodes whose text is a URL
 *   reference.
 * @param {string|URL} [exclude] - URL to omit from the result. When it is
 *   null or undefined nothing is omitted.
 * @returns {string[]} Absolute hrefs, in the order the nodes were given.
 */
function nodeTextToHrefs(nodes, exclude) {
  // "/dir" and "/dir/" name the same resource for the purpose of exclusion.
  const withoutTrailingSlash = (href) =>
    href.endsWith("/") ? href.slice(0, -1) : href;

  const excludedKey =
    exclude == null
      ? null
      : withoutTrailingSlash(new URL(exclude, root_url).href);

  const result = [];
  for (const node of nodes) {
    const href = new URL(node.textContent, root_url).href;
    if (withoutTrailingSlash(href) !== excludedKey) {
      result.push(href);
    }
  }
  return result;
}
|
||||
|
||||
/**
 * Lists the direct contents of the collection at `url`.
 *
 * Sends a PROPFIND request with a `Depth: 1` header, parses the response body
 * as XML and sorts the `href` elements into two groups depending on whether
 * their enclosing `response` element contains `resourcetype collection`.
 * `url` itself is passed to `nodeTextToHrefs` as the href to leave out.
 *
 * @param {string} url - URL of the collection to list.
 * @returns {Promise<[string[], string[]]>} `[collections, files]` hrefs.
 */
async function crawl(url) {
  const request = { method: "PROPFIND", headers: { Depth: "1" } };
  const body = await (await fetch(url, request)).text();
  const listing = new DOMParser().parseFromString(body, "text/xml");

  // Both groups are extracted the same way; only the selector differs.
  const hrefsMatching = (selector) =>
    nodeTextToHrefs(listing.querySelectorAll(selector), url);

  return [
    hrefsMatching("response:has(resourcetype collection) href"),
    hrefsMatching("response:not(:has(resourcetype collection)) href"),
  ];
}
|
||||
|
||||
/**
 * Recursively crawls the collection at `url` and everything beneath it.
 *
 * @param {string} url - URL of the collection to crawl.
 * @returns {Promise<Object>} Tree node with one property per subcollection
 *   href (whose value is that subcollection's own tree node) plus a
 *   `[__files__]` property holding the array of file hrefs found directly in
 *   this collection.
 */
async function crawlCollection(url) {
  const [subcollections, subfiles] = await crawl(url);

  // Sibling subtrees do not depend on each other, so crawl them concurrently
  // instead of waiting for each one before starting the next.
  const subtrees = await Promise.all(
    subcollections.map((subcollection) => crawlCollection(subcollection)),
  );

  // Assign in listing order so the keys keep the order the server reported.
  const result = {};
  subcollections.forEach((subcollection, index) => {
    result[subcollection] = subtrees[index];
  });
  result[__files__] = subfiles;
  return result;
}
|
||||
|
||||
/**
 * Crawls the collection at `url` and wraps its tree in an object whose single
 * key is `url`, so the root has the same shape as a subcollection entry.
 *
 * @param {string} url - URL of the collection to start from.
 * @returns {Promise<Object>} `{ [url]: <tree returned by crawlCollection> }`.
 */
async function recursiveCrawl(url) {
  const tree = await crawlCollection(url);
  return { [url]: tree };
}
|
||||
|
||||
/**
 * Renders one collection of a crawl result as a nested `<ul>`.
 *
 * The list starts with a link to the collection itself, followed by one
 * `<li>` per subcollection (each containing that subcollection's own nested
 * list) and then one `<li>` per file. Link labels are the href with
 * `root_url` removed and a leading "/" prepended.
 *
 * @param {string} url - Href of the collection being rendered.
 * @param {Object} recursiveCrawlResult - Tree node for that collection:
 *   subcollection hrefs as string keys, file hrefs under `[__files__]`.
 * @returns {HTMLUListElement} The populated list element (not yet attached).
 */
function formattedIndexDirectory(url, recursiveCrawlResult) {
  // Builds an anchor pointing at `href`, labelled relative to the site root.
  const linkTo = (href) => {
    const anchor = document.createElement("a");
    anchor.href = href;
    anchor.innerText = "/" + href.replace(root_url, "");
    return anchor;
  };

  // Wraps `child` in a new <li> and appends it to `list`.
  const addItem = (list, child) => {
    const item = document.createElement("li");
    item.appendChild(child);
    list.appendChild(item);
  };

  const list = document.createElement("ul");
  list.appendChild(linkTo(url));

  // getOwnPropertyNames skips the symbol-keyed file list, leaving only the
  // subcollection hrefs.
  for (const href of Object.getOwnPropertyNames(recursiveCrawlResult)) {
    addItem(list, formattedIndexDirectory(href, recursiveCrawlResult[href]));
  }

  for (const href of recursiveCrawlResult[__files__]) {
    addItem(list, linkTo(href));
  }

  return list;
}
|
||||
|
||||
// Module entry point: crawl the whole site starting at root_url, render the
// single root entry of the result as a nested list, and append it to the page.
const [[rootHref, rootTree]] = Object.entries(await recursiveCrawl(root_url));
document.body.appendChild(formattedIndexDirectory(rootHref, rootTree));
|
||||
|
|
@ -5,6 +5,7 @@
|
|||
<link rel="stylesheet" href="style.css" />
|
||||
<script type="module" src="editor.js" defer></script>
|
||||
<script type="module" src="tictactoe.js" defer></script>
|
||||
<script type="module" src="crawler.js" defer></script>
|
||||
<link rel="stylesheet" href="tictactoe.css" />
|
||||
</head>
|
||||
<div id="banner">
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue