nm3clol-express-app/one-time-conversion.mts

/**
 * Operation 1: Clear orphan index entries which have no file in public. Log output. Keep this gulp operation for later, call it 'index:clean-missing'.
 *
 * 1. Connect to Solr.
 * 2. Query * for all the documents. Return all the records and all the fields.
 * 3. For each of the records, check that the file exists. If the file does not exist on the filesystem, delete the index record.
 *
 * Operation 2: Save the stored records to the `${pathToFile}.md` file.
 *
 * 4. Check if the new file exists. If it exists, exit the procedure.
 * 5. If it does not exist, then create it.
 * 6. Write the contents to the file. Particularly, every file should have a YAML header. In that YAML header I want the following information stored:
 * ---
 * type: document
 * title: document title
 * mimeType: text/plain, application/vnd-word, etc.; use known good list
 * tags:
 *   - customized based on path, perhaps one tag per folder name
 * path: relative path to document file
 * sha256sum: checksum for SHA2-256 checksum
 * sha1sum: checksum for SHA1 checksum
 * ---
 * ...document plain text from the OCR operations.
 *
 * Operation 3: Save the Solr record without arrays into a second Solr instance.
 *
 * 7. Connect to second Solr instance.
 * 8. Store document correctly according to the new schema without additional cardinality of the search fields.
 * 9. Index all information in the .md (frontmatter and plaintext) schema into the Solr schema.
 *
 * Operation 4: Switch all code to use new Solr schema.
 *
 * 10. Update references to Solr so they use the new instance.
 *
 * Operation 5: Convert indexing code so that the Tika->.md file generation occurs in one operation and the .md->Solr scan occurs in another.
 *
 * 11. Run both operations to reindex the site.
 */

/** OPERATION 1: */

import axios from 'axios';
import { config } from './app/config.mjs';
import { SolrDocument, IncorrectStyleSolrDocument } from './app/search/solr-doc.mjs';
import request from 'request-promise-native';
import fs from 'fs';
import path from 'path';
import fm from 'gray-matter';

// Function to retrieve metadata of a file from Solr
async function getAllSolrDocuments(start: number = 0, rows: number = 10) {
    const res = await axios({ method: 'get', url: `${config.solrDocsUrl}/select?q=*:*&start=${start}&rows=${rows}`, responseType: 'json' })
    return res.data.response.docs;
}

async function removeSolrDocument(id: string) {
    const res = await axios({ method: 'post', url: `${config.solrDocsUrl}/update/json`, responseType: 'json', data: { commit: {}, delete: [ id ] } });
    return res.data;
}

const existingSolrDocuments: IncorrectStyleSolrDocument[] = (await getAllSolrDocuments(0, 65535));
const validSolrDocuments: StoredSolrDocument[] = [];
const stripLeftText = "https://no-moss-3-carbo-landfill-library.online/";

interface StoredSolrDocument extends SolrDocument {
    relativePathToFile: string;
    absolutePathToFile: string;
    doesExist: boolean;
};

for (const oldSolrDoc of existingSolrDocuments) {
    const solrDoc: StoredSolrDocument = {
        id: oldSolrDoc.id,
        content_length: oldSolrDoc.content_length && oldSolrDoc.content_length.length ? oldSolrDoc.content_length[0] : 0,
        content_type: oldSolrDoc.content_type && oldSolrDoc.content_type.length ? oldSolrDoc.content_type[0] : '',
        sha256sum: oldSolrDoc.sha256sum && oldSolrDoc.sha256sum.length ? oldSolrDoc.sha256sum[0] : '',
        text: oldSolrDoc.text && oldSolrDoc.text.length ? oldSolrDoc.text[0] : '',
        url: oldSolrDoc.url && oldSolrDoc.url.length ? oldSolrDoc.url[0] : '',
        _version_: oldSolrDoc._version_,
        relativePathToFile: '',
        absolutePathToFile: '',
        doesExist: false,
    };
    solrDoc.relativePathToFile = (solrDoc.url.startsWith(stripLeftText) ? solrDoc.url.substring(stripLeftText.length) : solrDoc.url).replaceAll('/', path.sep);
    solrDoc.absolutePathToFile = path.resolve(path.join(config.publicPath, solrDoc.relativePathToFile));
    solrDoc.doesExist = fs.existsSync(solrDoc.absolutePathToFile);
    // build the correct object
    if (!solrDoc.doesExist) {
        await removeSolrDocument(solrDoc.id).then((response) => {
            console.log(`Removed doc from search index because the file doesn't exist on the volume:`);
            console.log(`  id: ${solrDoc.id}`);
            console.log(`  url: ${solrDoc.url}`);
            console.log(`  path: ${solrDoc.absolutePathToFile}`);
            console.log(`  Response:`, response);
        })
    }
    else {
        validSolrDocuments.push(solrDoc);
    }
};

/** OPERATION 2: */
import moment from 'moment';
import { mkdir } from 'fs/promises';
function resolveTitleFromFileName(fileName: string) {
    return path.basename(fileName, path.extname(fileName)).replaceAll('_', ' ')
}

function resolveDateFromFileName(fileName: string) {
    // style 1: convert: 2024-06-19_00_25_00 to: 2024-06-19 12:25:00 am
    const m1 = 'YYYY-MM-DD hh:mm:ss a';
    const re1 = /^(\d{4})-(\d{2})-(\d{2})_(\d{2})_(\d{2})_(\d{2}).*/g;
    // style 2: convert: 2024-06-19_00_25 to: 2024-06-19 12:25:00 am
    const m2 = m1;
    const re2 = /^(\d{4})-(\d{2})-(\d{2})_(\d{2})_(\d{2}).*/g;
    // style 3: convert: 2024-06-19 to: 2024-06-19
    const re3 = /^(\d{4})-(\d{2})-(\d{2}).*/g;
    // style 4: convert: 2024-06 to: 2024-06
    const re4 = /^(\d{4})-(\d{2}).*/g;
    // style 5: convert: 2024 to: 2024
    const re5 = /^(\d{4}).*/g;
    if (fileName.search(re1) >= 0) {
        return moment(fileName.replace(re1, `$1-$2-$3 $4:$5:$6`)).format(m1);
    }
    else if (fileName.search(re2) >= 0) {
        return moment(fileName.replace(re2, `$1-$2-$3 $4:$5`)).format(m2);
    }
    else if (fileName.search(re3) >= 0) {
        return fileName.replace(re3, `$1-$2-$3`);
    }
    else if (fileName.search(re4) >= 0) {
        return fileName.replace(re4, `$1-$2`);
    }
    else if (fileName.search(re5) >= 0) {
        return fileName.replace(re5, `$1`);
    }
}

function resolveTagsFromRelativeFilePath(relFilePath: string) {
    let tags = relFilePath.split(path.sep);
    const fileName = tags.pop()!;
    const dateTag = resolveDateFromFileName(fileName!);
    if (dateTag) {
        // style 3: convert: 2024-06-19 to: 2024-06-19
        const re3 = /^(\d{4})-(\d{2})-(\d{2}).*/g;
        // style 4: convert: 2024-06 to: 2024-06
        const re4 = /^(\d{4})-(\d{2}).*/g;
        // style 5: convert: 2024 to: 2024
        const re5 = /^(\d{4}).*/g;
        if (fileName.search(re5) >= 0) {
            tags.push(fileName.replace(re5, `$1`));
        }
        if (fileName.search(re4) >= 0) {
            tags.push(fileName.replace(re4, `$1-$2`));
        }
        if (fileName.search(re3) >= 0) {
            tags.push(fileName.replace(re3, `$1-$2-$3`));
        }
    }
    return tags;
}

import crypto from 'crypto';
async function calculateSHA1Hash(filePath: string) {
    return new Promise<string>((resolve, reject) => {
        const readStream = fs.createReadStream(filePath);
        const hash = crypto.createHash('SHA1');

        readStream.on('data', (chunk) => {
            hash.update(chunk);
        });

        readStream.on('end', () => {
            const sha1Hash: string = hash.digest('hex');
            resolve(sha1Hash);
        });

        readStream.on('error', (error) => {
            reject(error);
        });
    });
}

for (const solrDoc of validSolrDocuments) {
    //guard: don't write this file for video files, all of which conveniently contain the string YouTube_Archive
    if (solrDoc.relativePathToFile.indexOf('YouTube_Archive') >= 0 || solrDoc.relativePathToFile.indexOf('2024-03-20-Governor_Youngkin_Listening_Session') >= 0) {
        console.log(`This Solr document is for a YouTube_Archive video, which uses a different metadata technique. Skipping creation of markdown file in folder.`);
        continue;
    }
    //guard: don't write this
    const mdFolderPath = path.join(path.dirname(solrDoc.absolutePathToFile), path.basename(solrDoc.absolutePathToFile, path.extname(solrDoc.absolutePathToFile)));
    if (fs.existsSync(mdFolderPath)) {
        console.log(`Directory at ${mdFolderPath} already exists. Continuing check for markdown file in folder.`);
    }
    else {
        //create folder
        await mkdir(mdFolderPath);
        console.log(`Directory at ${mdFolderPath} created.`);
    }
    const mdFilePath = path.join(mdFolderPath, 'README.md');
    // if (fs.existsSync(mdFilePath)) {
        // console.log(`Markdown file at ${mdFilePath} already exists. Skipping creation of markdown file in folder.`);
        // continue;
    // }
    // else {
        console.log(solrDoc.id);
        console.log(`Saving new markdown file at ${mdFilePath}:`);
        const mdFileContents = fm.stringify(solrDoc.text, {
            type: 'document',
            title: resolveTitleFromFileName(solrDoc.absolutePathToFile),
            file: path.posix.join('..', path.basename(solrDoc.absolutePathToFile)),
            tags: resolveTagsFromRelativeFilePath(solrDoc.relativePathToFile),
            docDate: resolveDateFromFileName(path.basename(solrDoc.absolutePathToFile))||null,
            contentType: solrDoc.content_type,
            contentLength: solrDoc.content_length,
            sha256sum: solrDoc.sha256sum,
            sha1sum: await calculateSHA1Hash(solrDoc.absolutePathToFile),
        });
        let ws = fs.createWriteStream(mdFilePath);
        ws.write(mdFileContents);
        ws.close();
    // }
}