/**
 * Operation 1: Clear orphan index entries which have no file in public. Log output. Keep this gulp operation for later, call it 'index:clean-missing'.
 *
 * 1. Connect to Solr.
 * 2. Query * for all the documents. Return all the records and all the fields.
 * 3. For each of the records, check that the file exists. If the file does not exist on the filesystem, delete the index record.
 *
 * Operation 2: Save the stored records to the `${pathToFile}.md` file.
 *
 * 4. Check if the new file exists. If it exists, exit the procedure.
 * 5. If it does not exist, then create it.
 * 6. Write the contents to the file. Particularly, every file should have a YAML header. In that YAML header I want the following information stored:
 * ---
 * type: document
 * title: document title
 * mimeType: text/plain, application/vnd-word, etc.; use known good list
 * tags:
 * - customized based on path, perhaps one tag per folder name
 * path: relative path to document file
 * sha256sum: checksum for SHA2-256 checksum
 * sha1sum: checksum for SHA1 checksum
 * ---
 * ...document plain text from the OCR operations.
 *
 * Operation 3: Save the Solr record without arrays into a second Solr instance.
 *
 * 7. Connect to second Solr instance.
 * 8. Store document correctly according to the new schema without additional cardinality of the search fields.
 * 9. Index all information in the .md (frontmatter and plaintext) schema into the Solr schema.
 *
 * Operation 4: Switch all code to use new Solr schema.
 *
 * 10. Update references to Solr so they use the new instance.
 *
 * Operation 5: Convert indexing code so that the Tika->.md file generation occurs in one operation and the .md->Solr scan occurs in another.
 *
 * 11. Run both operations to reindex the site.
 */

import axios from 'axios';
import { config } from './app/config.mjs';
import { SolrDocument, IncorrectStyleSolrDocument } from './app/search/solr-doc.mjs';
import request from 'request-promise-native';
import fs from 'fs';
import path from 'path';
import fm from 'gray-matter';
import moment from 'moment';
import { mkdir, writeFile } from 'fs/promises';
import crypto from 'crypto';

/** OPERATION 1: */

/**
 * Fetches one page of documents from the Solr core at `config.solrDocsUrl`
 * using the match-all query.
 *
 * @param start - Zero-based offset of the first document to return.
 * @param rows - Maximum number of documents to return.
 * @returns The `response.docs` array of the Solr select response.
 * @throws Error when the response does not contain a `response.docs` array.
 */
async function getAllSolrDocuments(start: number = 0, rows: number = 10): Promise<IncorrectStyleSolrDocument[]> {
  const res = await axios({
    method: 'get',
    url: `${config.solrDocsUrl}/select?q=*:*&start=${start}&rows=${rows}`,
    responseType: 'json',
  });
  const docs: unknown = res.data?.response?.docs;
  if (!Array.isArray(docs)) {
    throw new Error('Unexpected Solr select response: "response.docs" is not an array.');
  }
  return docs;
}

/**
 * Deletes a single document from the Solr index by id and commits the change.
 *
 * @param id - The Solr document id to delete.
 * @returns The parsed JSON body of the Solr update response.
 */
async function removeSolrDocument(id: string): Promise<unknown> {
  const res = await axios({
    method: 'post',
    url: `${config.solrDocsUrl}/update/json`,
    responseType: 'json',
    // `delete` is listed before `commit` on purpose: Solr's JSON update handler
    // is understood to run the commands in the order they appear, so a leading
    // commit would not persist this delete.
    // NOTE(review): confirm against the deployed Solr version.
    data: {
      delete: [id],
      commit: {},
    },
  });
  return res.data;
}

/** A flattened Solr record plus the location of its backing file on disk. */
interface StoredSolrDocument extends SolrDocument {
  relativePathToFile: string;
  absolutePathToFile: string;
  doesExist: boolean;
}

/**
 * Returns the first element of `values`, or `fallback` when `values` is
 * missing, empty, or its first element is null/undefined.
 */
function firstValue<T, F>(values: readonly T[] | null | undefined, fallback: F): T | F {
  return values?.[0] ?? fallback;
}

const existingSolrDocuments: IncorrectStyleSolrDocument[] = await getAllSolrDocuments(0, 65535);
const validSolrDocuments: StoredSolrDocument[] = [];
// Site prefix removed from each document URL to obtain a path relative to config.publicPath.
const stripLeftText = "https://no-moss-3-carbo-landfill-library.online/";

for (const oldSolrDoc of existingSolrDocuments) {
  const url = firstValue(oldSolrDoc.url, '');

  // NOTE(review): the URL is used verbatim as a path (no percent-decoding);
  // verify that indexed URLs never contain encoded characters, otherwise the
  // file will look missing and its record will be deleted.
  const sitePath = url.startsWith(stripLeftText) ? url.substring(stripLeftText.length) : url;
  const relativePathToFile = sitePath.replaceAll('/', path.sep);
  const absolutePathToFile = path.resolve(path.join(config.publicPath, relativePathToFile));

  // Build the flattened record: the old schema stores every field as an array.
  const solrDoc: StoredSolrDocument = {
    id: oldSolrDoc.id,
    content_length: firstValue(oldSolrDoc.content_length, 0),
    content_type: firstValue(oldSolrDoc.content_type, ''),
    sha256sum: firstValue(oldSolrDoc.sha256sum, ''),
    text: firstValue(oldSolrDoc.text, ''),
    url,
    _version_: oldSolrDoc._version_,
    relativePathToFile,
    absolutePathToFile,
    doesExist: fs.existsSync(absolutePathToFile),
  };

  if (solrDoc.doesExist) {
    validSolrDocuments.push(solrDoc);
    continue;
  }

  const response = await removeSolrDocument(solrDoc.id);
  console.log(`Removed doc from search index because the file doesn't exist on the volume:`);
  console.log(` id: ${solrDoc.id}`);
  console.log(` url: ${solrDoc.url}`);
  console.log(` path: ${solrDoc.absolutePathToFile}`);
  console.log(` Response:`, response);
}

/** OPERATION 2: */

// Output format for file names that carry a time of day, e.g. "2024-06-19 12:25:00 am".
const OUTPUT_TIMESTAMP_FORMAT = 'YYYY-MM-DD hh:mm:ss a';
// style 1: 2024-06-19_00_25_00
const TIMESTAMP_SECONDS_RE = /^(\d{4}-\d{2}-\d{2})_(\d{2})_(\d{2})_(\d{2})/;
// style 2: 2024-06-19_00_25
const TIMESTAMP_MINUTES_RE = /^(\d{4}-\d{2}-\d{2})_(\d{2})_(\d{2})/;
// styles 3-5: 2024-06-19, 2024-06, 2024
const DATE_PREFIX_RE = /^(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?/;

/**
 * Derives a human-readable title from a file name: drops the directory and
 * extension and replaces underscores with spaces.
 */
function resolveTitleFromFileName(fileName: string): string {
  return path.basename(fileName, path.extname(fileName)).replaceAll('_', ' ');
}

/**
 * Returns the cumulative date prefixes found at the start of `fileName`, from
 * least to most precise: e.g. "2024-06-19_x.pdf" yields
 * ["2024", "2024-06", "2024-06-19"]. Returns an empty array when the name
 * does not start with a four-digit year.
 */
function datePrefixes(fileName: string): string[] {
  const match = DATE_PREFIX_RE.exec(fileName);
  if (!match) {
    return [];
  }
  const prefixes: string[] = [];
  let current = '';
  for (const part of match.slice(1)) {
    // Optional groups that did not participate in the match are undefined.
    if (part === undefined) {
      break;
    }
    current = current ? `${current}-${part}` : part;
    prefixes.push(current);
  }
  return prefixes;
}

/**
 * Extracts the most precise date encoded at the start of a file name.
 *
 * Supported styles, tried from most to least precise:
 *  1. `2024-06-19_00_25_00` -> `2024-06-19 12:25:00 am`
 *  2. `2024-06-19_00_25`    -> `2024-06-19 12:25:00 am`
 *  3. `2024-06-19`          -> `2024-06-19`
 *  4. `2024-06`             -> `2024-06`
 *  5. `2024`                -> `2024`
 *
 * A timestamp that does not parse as a real date/time (styles 1 and 2) falls
 * through to the next style instead of producing the text "Invalid date".
 *
 * @param fileName - Base name of the file (no directory part).
 * @returns The formatted date, or `undefined` when the name has no date prefix.
 */
function resolveDateFromFileName(fileName: string): string | undefined {
  const withSeconds = TIMESTAMP_SECONDS_RE.exec(fileName);
  if (withSeconds) {
    const [, date, hours, minutes, seconds] = withSeconds;
    const parsed = moment(`${date} ${hours}:${minutes}:${seconds}`, 'YYYY-MM-DD HH:mm:ss', true);
    if (parsed.isValid()) {
      return parsed.format(OUTPUT_TIMESTAMP_FORMAT);
    }
  }

  const withMinutes = TIMESTAMP_MINUTES_RE.exec(fileName);
  if (withMinutes) {
    const [, date, hours, minutes] = withMinutes;
    const parsed = moment(`${date} ${hours}:${minutes}`, 'YYYY-MM-DD HH:mm', true);
    if (parsed.isValid()) {
      return parsed.format(OUTPUT_TIMESTAMP_FORMAT);
    }
  }

  const prefixes = datePrefixes(fileName);
  return prefixes[prefixes.length - 1];
}

/**
 * Builds the tag list for a document: one tag per folder in its relative
 * path, followed by the year, year-month and year-month-day tags (as far as
 * they are present) taken from a date prefix on the file name.
 *
 * @param relFilePath - Path relative to the public folder, using `path.sep`.
 * @returns The folder tags followed by any date tags.
 */
function resolveTagsFromRelativeFilePath(relFilePath: string): string[] {
  const tags = relFilePath.split(path.sep);
  // The last segment is the file name itself, not a folder tag.
  const fileName = tags.pop() ?? '';
  tags.push(...datePrefixes(fileName));
  return tags;
}

/**
 * Streams a file through a SHA-1 hash.
 *
 * @param filePath - Path of the file to hash.
 * @returns The lowercase hex digest; rejects with the stream error if the
 *   file cannot be read.
 */
async function calculateSHA1Hash(filePath: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const readStream = fs.createReadStream(filePath);
    const hash = crypto.createHash('SHA1');
    readStream.on('data', (chunk) => {
      hash.update(chunk);
    });
    readStream.on('end', () => {
      resolve(hash.digest('hex'));
    });
    readStream.on('error', (error) => {
      reject(error);
    });
  });
}

for (const solrDoc of validSolrDocuments) {
  const relPath = solrDoc.relativePathToFile;

  // guard: don't write this file for video files, all of which conveniently contain the string YouTube_Archive
  if (relPath.includes('YouTube_Archive') || relPath.includes('2024-03-20-Governor_Youngkin_Listening_Session')) {
    console.log(`This Solr document is for a YouTube_Archive video, which uses a different metadata technique. Skipping creation of markdown file in folder.`);
    continue;
  }

  // The markdown is written to "<file path without extension>/README.md".
  const filePath = solrDoc.absolutePathToFile;
  const mdFolderPath = path.join(path.dirname(filePath), path.basename(filePath, path.extname(filePath)));
  if (fs.existsSync(mdFolderPath)) {
    console.log(`Directory at ${mdFolderPath} already exists. Continuing check for markdown file in folder.`);
  } else {
    // create folder
    await mkdir(mdFolderPath);
    console.log(`Directory at ${mdFolderPath} created.`);
  }

  const mdFilePath = path.join(mdFolderPath, 'README.md');
  // The skip-if-exists guard below is currently disabled, so an existing
  // README.md is overwritten on every run.
  // if (fs.existsSync(mdFilePath)) {
  //   console.log(`Markdown file at ${mdFilePath} already exists. Skipping creation of markdown file in folder.`);
  //   continue;
  // }
  console.log(solrDoc.id);
  console.log(`Saving new markdown file at ${mdFilePath}:`);

  const mdFileContents = fm.stringify(solrDoc.text, {
    type: 'document',
    title: resolveTitleFromFileName(filePath),
    file: path.posix.join('..', path.basename(filePath)),
    tags: resolveTagsFromRelativeFilePath(relPath),
    docDate: resolveDateFromFileName(path.basename(filePath)) ?? null,
    contentType: solrDoc.content_type,
    contentLength: solrDoc.content_length,
    sha256sum: solrDoc.sha256sum,
    sha1sum: await calculateSHA1Hash(filePath),
  });

  // Awaited so the script does not move on (or exit) before the file is
  // flushed, and so a write failure rejects here instead of surfacing as an
  // unhandled stream 'error' event.
  await writeFile(mdFilePath, mdFileContents);
}