Add index crawler
This commit is contained in:
parent
3e655d56cf
commit
b53256d5e1
2 changed files with 85 additions and 0 deletions
84
www/crawler.js
Normal file
84
www/crawler.js
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
// Crawls the website to discover pages using PROPFIND
|
||||||
|
|
||||||
|
import { url as root_url } from "./editor.js";
|
||||||
|
|
||||||
|
/**
 * Symbol key under which `crawlCollection` stores, on each collection object,
 * the array of non-collection hrefs returned by `crawl` for that collection.
 * Being a symbol, it is skipped by `Object.getOwnPropertyNames`, which
 * `formattedIndexDirectory` uses to enumerate the sub-collection keys.
 */
export const __files__ = Symbol("crawlerFiles");
|
||||||
|
|
||||||
|
/**
 * Resolves the text content of each node to an absolute URL and returns those
 * URLs in input order, leaving out any that designate the `exclude` resource.
 *
 * Relative references are resolved against `root_url`. `exclude` is resolved
 * the same way before comparing, and one trailing slash is ignored on both
 * sides, so a collection requested as "http://host" or "/dir" still filters
 * out its own entry ("http://host/" or "/dir/") from the listing.
 *
 * @param {Iterable<{textContent: string}>} nodes - Nodes whose text is a URL reference.
 * @param {string} [exclude] - URL to leave out; when null/undefined nothing is excluded.
 * @returns {string[]} Absolute URLs of the remaining nodes.
 */
function nodeTextToHrefs(nodes, exclude) {
  // "/dir" and "/dir/" name the same resource for the purpose of self-exclusion.
  const comparable = (href) => (href.endsWith("/") ? href.slice(0, -1) : href);
  const excluded =
    exclude == null ? null : comparable(new URL(exclude, root_url).href);

  const result = [];
  for (const node of nodes) {
    const href = new URL(node.textContent, root_url).href;
    if (comparable(href) !== excluded) {
      result.push(href);
    }
  }
  return result;
}
|
||||||
|
|
||||||
|
/**
 * Lists the direct members of a collection with a `PROPFIND` request
 * (`Depth: 1`) and splits them into sub-collections and files.
 *
 * A member counts as a collection when its `response` element contains a
 * `resourcetype` with a `collection` descendant. The href of `url` itself is
 * filtered out by `nodeTextToHrefs`.
 *
 * If the server answers with a non-2xx status, or the body is not well-formed
 * XML, a warning is logged and an empty listing is returned so that one
 * unreadable directory does not abort the whole crawl.
 *
 * @param {string} url - URL of the collection to list.
 * @returns {Promise<[string[], string[]]>} `[collections, files]` as absolute URLs.
 */
async function crawl(url) {
  const response = await fetch(url, {
    method: "PROPFIND",
    headers: { Depth: "1" },
  });
  if (!response.ok) {
    console.warn(
      `PROPFIND ${url} failed: ${response.status} ${response.statusText}`,
    );
    return [[], []];
  }

  const text = await response.text();
  const doc = new DOMParser().parseFromString(text, "text/xml");
  // DOMParser reports XML errors in-band as a <parsererror> element.
  if (doc.querySelector("parsererror") !== null) {
    console.warn(`PROPFIND ${url} returned a body that is not well-formed XML`);
    return [[], []];
  }

  // Only the href that is a direct child of <response> identifies the member;
  // href elements nested inside property values (e.g. lock or principal
  // properties) must not be mistaken for directory entries.
  const isCollection = ":has(resourcetype collection)";
  const collections = nodeTextToHrefs(
    doc.querySelectorAll(`response${isCollection} > href`),
    url,
  );
  const files = nodeTextToHrefs(
    doc.querySelectorAll(`response:not(${isCollection}) > href`),
    url,
  );
  return [collections, files];
}
|
||||||
|
|
||||||
|
/**
 * Recursively crawls a collection and everything below it.
 *
 * The returned object has one string key per sub-collection URL, whose value
 * is that sub-collection's own crawl result, plus the `__files__` symbol key
 * holding the array of file URLs reported for this collection.
 *
 * Sub-collections are independent of one another, so they are crawled
 * concurrently; keys are still inserted in the order `crawl` reported them.
 *
 * @param {string} url - URL of the collection to crawl.
 * @returns {Promise<Object>} Nested crawl result for `url`.
 */
async function crawlCollection(url) {
  const [subcollections, subfiles] = await crawl(url);
  const subtrees = await Promise.all(
    subcollections.map((subcollection) => crawlCollection(subcollection)),
  );

  const result = {};
  subcollections.forEach((subcollection, index) => {
    result[subcollection] = subtrees[index];
  });
  result[__files__] = subfiles;
  return result;
}
|
||||||
|
|
||||||
|
/**
 * Crawls the tree rooted at `url` and wraps the result in an object keyed by
 * that URL, so the root has the same "URL -> crawl result" shape as every
 * sub-collection entry.
 *
 * @param {string} url - URL of the root collection.
 * @returns {Promise<Object>} Object with the single key `url` mapping to its crawl result.
 */
async function recursiveCrawl(url) {
  const tree = await crawlCollection(url);
  return { [url]: tree };
}
|
||||||
|
|
||||||
|
/**
 * Renders one collection of a crawl result as a nested `<ul>`.
 *
 * The list starts with a link to the collection itself, followed by one
 * `<li>` per sub-collection (containing that sub-collection's own list) and
 * one `<li>` per file. Link labels are the URL with the leading `root_url`
 * prefix removed and a "/" prepended.
 *
 * @param {string} url - URL of the collection being rendered.
 * @param {Object} recursiveCrawlResult - Crawl result for `url`: string keys
 *   are sub-collection URLs, `recursiveCrawlResult[__files__]` is the array
 *   of file URLs.
 * @returns {HTMLUListElement} The list element for this collection.
 */
function formattedIndexDirectory(url, recursiveCrawlResult) {
  const base = String(root_url);

  // Builds an <a> pointing at `href`, labelled with its path below the root.
  const makeLink = (href) => {
    const a = document.createElement("a");
    a.href = href;
    // Strip the root only when it is a real prefix; a plain replace() would
    // also remove an occurrence in the middle of the URL (e.g. in a query).
    const path = href.startsWith(base) ? href.slice(base.length) : href;
    a.textContent = `/${path}`;
    return a;
  };

  const ul = document.createElement("ul");
  ul.appendChild(makeLink(url));

  // getOwnPropertyNames yields string keys only, so the __files__ symbol
  // entry is not treated as a sub-collection.
  for (const subcollection of Object.getOwnPropertyNames(recursiveCrawlResult)) {
    const li = document.createElement("li");
    li.appendChild(
      formattedIndexDirectory(subcollection, recursiveCrawlResult[subcollection]),
    );
    ul.appendChild(li);
  }

  for (const file of recursiveCrawlResult[__files__]) {
    const li = document.createElement("li");
    li.appendChild(makeLink(file));
    ul.appendChild(li);
  }

  return ul;
}
|
||||||
|
|
||||||
|
// Crawl the whole site starting at root_url and append the rendered index to
// the page. A failed crawl (e.g. the network request rejecting) is logged
// instead of surfacing as an unhandled module evaluation error.
try {
  // recursiveCrawl returns an object with exactly one entry: root URL -> tree.
  const [[root, tree]] = Object.entries(await recursiveCrawl(root_url));
  document.body.appendChild(formattedIndexDirectory(root, tree));
} catch (err) {
  console.error("Failed to build the site index:", err);
}
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
<link rel="stylesheet" href="style.css" />
|
<link rel="stylesheet" href="style.css" />
|
||||||
<script type="module" src="editor.js" defer></script>
|
<script type="module" src="editor.js" defer></script>
|
||||||
<script type="module" src="tictactoe.js" defer></script>
|
<script type="module" src="tictactoe.js" defer></script>
|
||||||
|
<script type="module" src="crawler.js" defer></script>
|
||||||
<link rel="stylesheet" href="tictactoe.css" />
|
<link rel="stylesheet" href="tictactoe.css" />
|
||||||
</head>
|
</head>
|
||||||
<div id="banner">
|
<div id="banner">
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue