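// gulpfile.js (forked from nm3clol/nm3clol-express-app)
//
// Gulp tasks that index the document library under ./public into Apache Solr,
// extracting text with an Apache Tika server and skipping files whose size and
// SHA-256 checksum already match what is stored in the index.
//
// Typical usage (assuming gulp-cli is available): `gulp index` to index new or
// changed files, `gulp index:reindex` to clear Solr and rebuild from scratch.
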
const gulp = require('gulp');
const request = require('request-promise-native');
const axios = require('axios');
const glob = require('glob');
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { TikaClient } = require('./app/TikaClient/build');
const { Writable } = require('stream');

const relPathToFiles = './public';
const baseUrl = 'https://no-moss-3-carbo-landfill-library.online'; // Base URL where the indexed files are published
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // URL of the Tika server
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of the main document Solr core
const solrVirginiaLawUrl = 'http://solr.services.cleveland.daball.me:8983/solr/va_code'; // URL of the Virginia law Solr core

// Task to clear out previous Solr data
gulp.task('index:clear', async () => {
  await request({
    uri: `${solrUrl}/update?commit=true`,
    method: 'POST',
    body: { delete: { query: '*:*' } }, // Delete all documents
    json: true,
  });
});
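
// The clear task uses Solr's JSON delete-by-query update; roughly equivalent to:
//   curl -X POST "$SOLR_URL/update?commit=true" -H 'Content-Type: application/json' \
//        -d '{"delete":{"query":"*:*"}}'
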
// Task to pull all indexed documents back out of Solr
gulp.task('dbfromsolr', async () => {
  let docs = await request({
    uri: `${solrUrl}/select`,
    qs: {
      q: '*:*',
      wt: 'json',
      rows: 10000,
    },
    json: true,
  });
  // Solr returns stored fields as arrays, so flatten each one to a scalar
  docs = docs?.response?.docs?.map(({ id, sha256sum, url, content_length, content_type, text, _version_ }) => ({
    id,
    url: url.join(''),
    content_length: parseInt(content_length.join(''), 10),
    sha256sum: sha256sum.join(''),
    content_type: content_type.join(''),
    text: text.join(''),
    _version_,
  }));
  // TODO: the original second .map() was left empty; persisting `docs` to the
  // database is not yet implemented.
});
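
// Note: rows=10000 caps this export; for larger result sets Solr's cursorMark
// pagination (or the /export handler) is the usual approach.
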
// Stream a file through SHA-256 and resolve with the hex digest
async function calculateSHA256Hash(filePath) {
  return new Promise((resolve, reject) => {
    const readStream = fs.createReadStream(filePath);
    const hash = crypto.createHash('sha256');

    readStream.on('data', (chunk) => {
      hash.update(chunk);
    });

    readStream.on('end', () => {
      resolve(hash.digest('hex'));
    });

    readStream.on('error', (error) => {
      reject(error);
    });
  });
}

// Function to retrieve metadata of a file from the Virginia law Solr core
async function retrieveVirginiaLawMetadataFromSolr(url) {
  // Look the document up by its URL, which doubles as its unique id
  const fl = encodeURIComponent('sha256sum, content_length');
  const q = encodeURIComponent('id:') + '"' + encodeURIComponent(url) + '"';
  const uri = `${solrVirginiaLawUrl}/select?q=${q}&fl=${fl}`;
  const response = await request({ uri, json: true });
  return response && response.response && response.response.docs && response.response.docs[0];
}

// Function to retrieve metadata of a file from the main document Solr core
async function retrieveMetadataFromSolr(url) {
  // Look the document up by its URL, which doubles as its unique id
  const fl = encodeURIComponent('sha256sum, content_length');
  const q = encodeURIComponent('id:') + '"' + encodeURIComponent(url) + '"';
  const uri = `${solrUrl}/select?q=${q}&fl=${fl}`;
  const response = await request({ uri, json: true });
  return response && response.response && response.response.docs && response.response.docs[0];
}
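
// Both lookups return the first matching doc, shaped roughly like
//   { sha256sum: ['ab12...'], content_length: ['12345'] }
// (assuming these fields are multivalued in the Solr schema), which is why the
// indexing tasks below read metadata.sha256sum[0] and metadata.content_length[0].
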
async function indexDocumentInSolr(document) {
  try {
    // Send the document to the main core's JSON document endpoint
    await axios.post(solrUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}

async function indexLawDocumentInSolr(document) {
  try {
    // Send the document to the Virginia law core's JSON document endpoint
    await axios.post(solrVirginiaLawUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}
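
// Note: commit=true on every document keeps the index immediately searchable,
// but per-document hard commits are slow for large batches; Solr's commitWithin
// update parameter is the usual alternative.
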
// Map a file extension to a MIME type, e.g. extToMime('Report.pdf') -> 'application/pdf'
function extToMime(file_name) {
  switch (path.extname(file_name)) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
      return 'text/plain';
    case '.mkv':
      return 'video/x-matroska';
    default:
      return 'text/plain';
  }
}

// Task to index law files into Solr
gulp.task('index:laws', async () => {
  //let scanExts = ''; //set to empty string to scan all
  let scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt}';
  let globs = [
    `Russell_County/Ordinances/**/*${scanExts}`,
    `Virginia_Law_Library/**/*${scanExts}`,
  ];
  // Use glob to match files in the local directories (globSync is synchronous,
  // so a plain forEach callback is fine here)
  let files = [];
  let cwd = path.resolve(__dirname, relPathToFiles.replaceAll('/', path.sep));
  globs.forEach((globPattern) => {
    files = files.concat(glob.globSync(globPattern, {
      cwd,
      matchBase: true,
      follow: true,
    }));
  });
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);
  // Loop through each file and process it
  for (let f = 0; f < files.length; f++) {
    const file = files[f];
    console.log(`${f + 1}/${files.length}: ${file}`);

    const fileFullPath = path.join(cwd, file);

    const url = `${baseUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + url);

    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveVirginiaLawMetadataFromSolr(url);

    // Calculate file size
    const stats = fs.statSync(fileFullPath);
    const fileSize = stats.size;

    // Calculate SHA256 checksum
    const checksum = await calculateSHA256Hash(fileFullPath);

    // Compare metadata; reindex when the file is new or has changed
    if (!metadata || parseInt(metadata.content_length[0], 10) != fileSize || metadata.sha256sum[0] != checksum) {
      console.log(`Processing text from file using Tika.`);
      const client = new TikaClient({ host: tikaUrl });
      const version = await client.getVersion();
      console.info(`Tika Server Version: ${version}`);

      // Create a Readable stream for the file contents
      const fileStream = fs.createReadStream(fileFullPath);

      // Create a Writable stream that captures the extracted text into a string
      let extractedText = '';
      const writableStream = new Writable({
        write(chunk, encoding, callback) {
          extractedText += chunk.toString(); // Append the chunk to the extracted text
          callback();
        }
      });

      // Use the TikaClient's pipe method to extract text content
      await client.pipe(fileStream, writableStream, 'text/plain', encodeURI(path.basename(file)));
      console.log("Extracted Text:", extractedText);

      // Create Solr document
      const solrDocument = {
        id: url, // The URL doubles as the unique identifier for the document
        text: extractedText, // Add the extracted text content
        sha256sum: checksum, // Add the checksum
        url: url,
        content_length: fileSize,
        content_type: extToMime(url),
        // Add additional fields as needed (e.g., title, author, etc.)
      };

      // Index the file with its text content and metadata
      console.log(`Indexing ${url}`);
      await indexLawDocumentInSolr(solrDocument);
      console.log(`Done.`);
    } else {
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
    }
  }
});

// Task to index general document files into Solr
gulp.task('index:docs', async () => {
  //let scanExts = ''; //set to empty string to scan all
  let scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt,mkv}';
  let globs = [
    `Amys_Drop_Box/**/*${scanExts}`,
    `CRS_Reports/**/*${scanExts}`,
    `Mine_Safety_and_Health_Administration/**/*${scanExts}`,
    `Potesta_&_Associates/**/*${scanExts}`,
    `Russell_County/**/*${scanExts}`,
    `Russell_County_Reclamation_LLC/**/*${scanExts}`,
    `Tobacco_Region_Revitalization_Commission/**/*${scanExts}`,
    `United_Mine_Workers_of_America/**/*${scanExts}`,
    `Virginia_Energy/**/*${scanExts}`,
    // I want to put Virginia Law in its own search category first.
    // `Virginia_Law_Library/**/*${scanExts}`,
  ];
  // Use glob to match files in the local directories (globSync is synchronous,
  // so a plain forEach callback is fine here)
  let files = [];
  let cwd = path.resolve(__dirname, relPathToFiles.replaceAll('/', path.sep));
  globs.forEach((globPattern) => {
    files = files.concat(glob.globSync(globPattern, {
      cwd,
      matchBase: true,
      follow: true,
    }));
  });
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);
  // Loop through each file and process it
  for (let f = 0; f < files.length; f++) {
    const file = files[f];
    console.log(`${f + 1}/${files.length}: ${file}`);

    const fileFullPath = path.join(cwd, file);

    let url = `${baseUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + url);

    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveMetadataFromSolr(url);

    // Calculate file size
    const stats = fs.statSync(fileFullPath);
    const fileSize = stats.size;

    // Calculate SHA256 checksum
    const checksum = await calculateSHA256Hash(fileFullPath);

    // Compare metadata; reindex when the file is new or has changed
    if (!metadata || parseInt(metadata.content_length[0], 10) != fileSize || metadata.sha256sum[0] != checksum) {
      console.log(`Processing text from file using Tika.`);
      const client = new TikaClient({ host: tikaUrl });
      const version = await client.getVersion();
      console.info(`Tika Server Version: ${version}`);

      let extractedText = '';

      let subtitleExt = ".en.vtt";
      if (url.endsWith(".webm") || url.endsWith(".mkv") || url.endsWith(".mpg") || url.endsWith(".mpeg") || url.endsWith(".mp4")) {
        // For videos, index the sidecar subtitle file rather than the video itself
        let subtitleFilePath = fileFullPath.substring(0, fileFullPath.lastIndexOf('.')) + subtitleExt;
        if (fs.existsSync(subtitleFilePath)) {
          console.log("Found VTT subtitle file at:", subtitleFilePath);
          extractedText = fs.readFileSync(subtitleFilePath, 'utf8');
          // Index videos under their containing directory URL (note: the metadata
          // lookup above used the file URL, so videos are re-checked on every run)
          url = url.substring(0, url.lastIndexOf('/') + 1);
        }
        else {
          console.log("No subtitles found at:", subtitleFilePath);
          console.log("Skipping this video file. Not adding this to the index until subtitles are available.");
          continue;
        }
      }
      else {
        // Create a Readable stream for the file contents
        const fileStream = fs.createReadStream(fileFullPath);
        // Create a Writable stream that captures the extracted text into a string
        const writableStream = new Writable({
          write(chunk, encoding, callback) {
            extractedText += chunk.toString(); // Append the chunk to the extracted text
            callback();
          }
        });
        // Use the TikaClient's pipe method to extract text content
        await client.pipe(fileStream, writableStream, 'text/plain', encodeURI(path.basename(file)));
      }
      if (!extractedText) {
        console.log("Skipping document because no text was detected.");
        continue;
      }
      else if (extractedText.length < 100) {
        console.log("Extracted Text:", extractedText);
      }
      else {
        console.log("Extracted Text (excerpt):", extractedText.substring(0, 99));
      }

      // Create Solr document
      const solrDocument = {
        id: url, // The URL doubles as the unique identifier for the document
        text: extractedText, // Add the extracted text content
        sha256sum: checksum, // Add the checksum
        url: url,
        content_length: fileSize,
        content_type: extToMime(url),
        // Add additional fields as needed (e.g., title, author, etc.)
      };

      // Index the file with its text content and metadata
      console.log(`Indexing ${url}`);
      await indexDocumentInSolr(solrDocument);
      console.log(`Done.`);
    } else {
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
    }
  }
});

// Task to run all indexing
gulp.task('index', gulp.series('index:docs', 'index:laws'));

// Task to clear out previous Solr data and then reindex from scratch
gulp.task('index:reindex', gulp.series('index:clear', 'index'));

// Default task runs indexing
gulp.task('default', gulp.series('index'));