nm3clol-express-app/one-time-conversion.mts

224 lines
9.3 KiB
TypeScript

/**
* Operation 1: Clear orphan index entries which have no file in public. Log output. Keep this gulp operation for later, call it 'index:clean-missing'.
*
* 1. Connect to Solr.
* 2. Query * for all the documents. Return all the records and all the fields.
* 3. For each of the records, check that the file exists. If the file does not exist on the filesystem, delete the index record.
*
* Operation 2: Save the stored records to the `${pathToFile}.md` file.
*
* 4. Check if the new file exists. If it exists, exit the procedure.
* 5. If it does not exist, then create it.
* 6. Write the contents to the file. Particularly, every file should have a YAML header. In that YAML header I want the following information stored:
* ---
* type: document
* title: document title
* mimeType: text/plain, application/vnd-word, etc.; use known good list
* tags:
* - customized based on path, perhaps one tag per folder name
* path: relative path to document file
* sha256sum: SHA2-256 checksum of the document file
* sha1sum: SHA1 checksum of the document file
* ---
* ...document plain text from the OCR operations.
*
* Operation 3: Save the Solr record without arrays into a second Solr instance.
*
* 7. Connect to second Solr instance.
* 8. Store document correctly according to the new schema without additional cardinality of the search fields.
* 9. Index all information in the .md (frontmatter and plaintext) schema into the Solr schema.
*
* Operation 4: Switch all code to use new Solr schema.
*
* 10. Update references to Solr so they use the new instance.
*
* Operation 5: Convert indexing code so that the Tika->.md file generation occurs in one operation and the .md->Solr scan occurs in another.
*
* 11. Run both operations to reindex the site.
*/
/** OPERATION 1: */
import axios from 'axios';
import { config } from './app/config.mjs';
import { SolrDocument, IncorrectStyleSolrDocument } from './app/search/solr-doc.mjs';
import request from 'request-promise-native';
import fs from 'fs';
import path from 'path';
import fm from 'gray-matter';
// Function to retrieve metadata of a file from Solr
/**
 * Fetches documents from the Solr docs core using a match-all (`*:*`) query.
 *
 * @param start Offset of the first record, passed as Solr's `start` parameter.
 * @param rows Maximum number of records, passed as Solr's `rows` parameter.
 * @returns The `response.docs` array from Solr's JSON reply.
 */
async function getAllSolrDocuments(start: number = 0, rows: number = 10) {
  const selectUrl = `${config.solrDocsUrl}/select?q=*:*&start=${start}&rows=${rows}`;
  const { data } = await axios({ method: 'get', url: selectUrl, responseType: 'json' });
  return data.response.docs;
}
/**
 * Deletes one document from the Solr docs core by id and commits the deletion.
 *
 * @param id The Solr document id to delete.
 * @returns The JSON body of Solr's update response.
 */
async function removeSolrDocument(id: string) {
  // Solr executes the commands of a JSON update message in the order they
  // appear, so `delete` must come before `commit`; otherwise the commit runs
  // first and this deletion is left uncommitted by the request.
  const res = await axios({
    method: 'post',
    url: `${config.solrDocsUrl}/update/json`,
    responseType: 'json',
    data: { delete: [id], commit: {} },
  });
  return res.data;
}
/** A Solr record flattened to single values, annotated with where its file lives on disk. */
interface StoredSolrDocument extends SolrDocument {
  relativePathToFile: string;
  absolutePathToFile: string;
  doesExist: boolean;
}

// Every record currently in the old index (single request, up to 65535 rows).
const existingSolrDocuments: IncorrectStyleSolrDocument[] = await getAllSolrDocuments(0, 65535);
// Records whose backing file was found on disk; consumed by Operation 2.
const validSolrDocuments: StoredSolrDocument[] = [];
// Site prefix removed from a record's url to obtain a path relative to the public folder.
const stripLeftText = "https://no-moss-3-carbo-landfill-library.online/";

for (const legacyDoc of existingSolrDocuments) {
  // The old schema stores each field as an array; keep only the first value.
  const url = legacyDoc.url?.length ? legacyDoc.url[0] : '';

  // Turn the url into a filesystem path below the public folder.
  let urlPath = url;
  if (url.startsWith(stripLeftText)) {
    urlPath = url.substring(stripLeftText.length);
  }
  const relativePathToFile = urlPath.replaceAll('/', path.sep);
  const absolutePathToFile = path.resolve(path.join(config.publicPath, relativePathToFile));

  const solrDoc: StoredSolrDocument = {
    id: legacyDoc.id,
    content_length: legacyDoc.content_length?.length ? legacyDoc.content_length[0] : 0,
    content_type: legacyDoc.content_type?.length ? legacyDoc.content_type[0] : '',
    sha256sum: legacyDoc.sha256sum?.length ? legacyDoc.sha256sum[0] : '',
    text: legacyDoc.text?.length ? legacyDoc.text[0] : '',
    url,
    _version_: legacyDoc._version_,
    relativePathToFile,
    absolutePathToFile,
    doesExist: fs.existsSync(absolutePathToFile),
  };

  if (solrDoc.doesExist) {
    validSolrDocuments.push(solrDoc);
    continue;
  }

  // Orphaned index entry: the file is gone, so drop the record and log what was removed.
  const response = await removeSolrDocument(solrDoc.id);
  console.log(`Removed doc from search index because the file doesn't exist on the volume:`);
  console.log(` id: ${solrDoc.id}`);
  console.log(` url: ${solrDoc.url}`);
  console.log(` path: ${solrDoc.absolutePathToFile}`);
  console.log(` Response:`, response);
}
/** OPERATION 2: */
import moment from 'moment';
import { mkdir } from 'fs/promises';
/**
 * Builds a human-readable title from a file name or path.
 *
 * Directories and the final extension are dropped, and every underscore
 * becomes a space.
 *
 * @param fileName File name or path to derive the title from.
 * @returns The base name without its extension, underscores replaced by spaces.
 */
function resolveTitleFromFileName(fileName: string) {
  const extension = path.extname(fileName);
  const stem = path.basename(fileName, extension);
  return stem.split('_').join(' ');
}
/**
 * Derives a date (or date-time) string from the stamp at the start of a file name.
 *
 * Recognized leading stamps, most specific first:
 *  - `YYYY-MM-DD_HH_mm_ss` becomes `YYYY-MM-DD hh:mm:ss a`
 *    (e.g. `2024-06-19_00_25_00` becomes `2024-06-19 12:25:00 am`)
 *  - `YYYY-MM-DD_HH_mm` becomes the same format with seconds `00`
 *  - `YYYY-MM-DD`, `YYYY-MM` and `YYYY` are returned as written
 *
 * A date-time stamp that moment cannot parse (for example an hour of `99`)
 * falls back to the next less specific stamp, so the literal string
 * "Invalid date" is never returned.
 *
 * @param fileName File name (without directories) to inspect.
 * @returns The derived date string, or `undefined` when the name does not
 *          start with four digits.
 */
function resolveDateFromFileName(fileName: string): string | undefined {
  const dateTimeFormat = 'YYYY-MM-DD hh:mm:ss a';

  // style 1: 2024-06-19_00_25_00 -> 2024-06-19 12:25:00 am
  const withSeconds = /^(\d{4})-(\d{2})-(\d{2})_(\d{2})_(\d{2})_(\d{2})/.exec(fileName);
  if (withSeconds) {
    const [, year, month, day, hour, minute, second] = withSeconds;
    const parsed = moment(`${year}-${month}-${day} ${hour}:${minute}:${second}`);
    if (parsed.isValid()) {
      return parsed.format(dateTimeFormat);
    }
  }

  // style 2: 2024-06-19_00_25 -> 2024-06-19 12:25:00 am
  const withMinutes = /^(\d{4})-(\d{2})-(\d{2})_(\d{2})_(\d{2})/.exec(fileName);
  if (withMinutes) {
    const [, year, month, day, hour, minute] = withMinutes;
    const parsed = moment(`${year}-${month}-${day} ${hour}:${minute}`);
    if (parsed.isValid()) {
      return parsed.format(dateTimeFormat);
    }
  }

  // style 3: 2024-06-19 -> 2024-06-19
  const fullDate = /^(\d{4})-(\d{2})-(\d{2})/.exec(fileName);
  if (fullDate) {
    return `${fullDate[1]}-${fullDate[2]}-${fullDate[3]}`;
  }

  // style 4: 2024-06 -> 2024-06
  const yearMonth = /^(\d{4})-(\d{2})/.exec(fileName);
  if (yearMonth) {
    return `${yearMonth[1]}-${yearMonth[2]}`;
  }

  // style 5: 2024 -> 2024
  const yearOnly = /^(\d{4})/.exec(fileName);
  if (yearOnly) {
    return yearOnly[1];
  }

  return undefined;
}
/**
 * Builds the tag list for a document from its relative file path.
 *
 * Every folder name in the path becomes a tag. When the file name carries a
 * leading date stamp, the year, the year-month and the full date are appended
 * as further tags, in that order, for each level the stamp provides.
 *
 * @param relFilePath Path to the file, using the platform path separator.
 * @returns Folder-name tags followed by any date tags.
 */
function resolveTagsFromRelativeFilePath(relFilePath: string) {
  const segments = relFilePath.split(path.sep);
  // The last segment is the file itself; it is not a folder tag.
  const fileName = segments.pop()!;

  if (!resolveDateFromFileName(fileName)) {
    return segments;
  }

  // Coarsest to finest: year, year-month, full date.
  const datePrefixes: [RegExp, string][] = [
    [/^(\d{4}).*/g, `$1`],
    [/^(\d{4})-(\d{2}).*/g, `$1-$2`],
    [/^(\d{4})-(\d{2})-(\d{2}).*/g, `$1-$2-$3`],
  ];
  for (const [pattern, replacement] of datePrefixes) {
    if (fileName.search(pattern) >= 0) {
      segments.push(fileName.replace(pattern, replacement));
    }
  }
  return segments;
}
import crypto from 'crypto';
/**
 * Computes the SHA1 digest of a file by streaming its contents.
 *
 * @param filePath Path of the file to hash.
 * @returns The digest as a hex string; the promise rejects if the file cannot be read.
 */
async function calculateSHA1Hash(filePath: string) {
  const digester = crypto.createHash('SHA1');
  // Feed the file through the hash chunk by chunk instead of loading it whole.
  for await (const piece of fs.createReadStream(filePath)) {
    digester.update(piece);
  }
  const sha1Hash: string = digester.digest('hex');
  return sha1Hash;
}
// For every record whose file exists, write `<file name without extension>/README.md`
// next to the file, holding YAML frontmatter plus the document's plain text.
for (const solrDoc of validSolrDocuments) {
  //guard: don't write this file for video files, all of which conveniently contain the string YouTube_Archive
  const isVideoEntry =
    solrDoc.relativePathToFile.includes('YouTube_Archive') ||
    solrDoc.relativePathToFile.includes('2024-03-20-Governor_Youngkin_Listening_Session');
  if (isVideoEntry) {
    console.log(`This Solr document is for a YouTube_Archive video, which uses a different metadata technique. Skipping creation of markdown file in folder.`);
    continue;
  }

  // The markdown file lives in a sibling folder named after the document without its extension.
  const documentPath = solrDoc.absolutePathToFile;
  const mdFolderPath = path.join(path.dirname(documentPath), path.basename(documentPath, path.extname(documentPath)));
  if (fs.existsSync(mdFolderPath)) {
    console.log(`Directory at ${mdFolderPath} already exists. Continuing check for markdown file in folder.`);
  }
  else {
    //create folder
    await mkdir(mdFolderPath);
    console.log(`Directory at ${mdFolderPath} created.`);
  }

  const mdFilePath = path.join(mdFolderPath, 'README.md');
  // The skip-if-exists check is disabled, so an existing README.md is overwritten:
  // if (fs.existsSync(mdFilePath)) {
  // console.log(`Markdown file at ${mdFilePath} already exists. Skipping creation of markdown file in folder.`);
  // continue;
  // }
  console.log(solrDoc.id);
  console.log(`Saving new markdown file at ${mdFilePath}:`);
  const mdFileContents = fm.stringify(solrDoc.text, {
    type: 'document',
    title: resolveTitleFromFileName(documentPath),
    // Link back to the document, which sits one level above the README's folder.
    file: path.posix.join('..', path.basename(documentPath)),
    tags: resolveTagsFromRelativeFilePath(solrDoc.relativePathToFile),
    docDate: resolveDateFromFileName(path.basename(documentPath))||null,
    contentType: solrDoc.content_type,
    contentLength: solrDoc.content_length,
    sha256sum: solrDoc.sha256sum,
    sha1sum: await calculateSHA1Hash(documentPath),
  });
  // Await the write so a failure rejects here, with the offending document known,
  // instead of surfacing later as an unhandled stream 'error' event.
  await fs.promises.writeFile(mdFilePath, mdFileContents);
}