// nm3clol-express-app/index/gulpfile.js

const gulp = require('gulp');
const gulpif = require('gulp-if');
const through2 = require('through2');
//const shell = require('gulp-shell');
const axios = require('axios');
const fs = require('fs');
const os = require ("os");
const path = require('path');
const crypto = require('crypto');
const cheerio = require('cheerio');
const { TikaClient } = require('tika-js');

const baseUrl = 'https://no-moss-3-carbo-landfill-library.online'; // Site base URL; prefixed to each indexed path to build the document URL
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // URL of the Tika instance (passed as the TikaClient `host`)
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of the Solr core used for the select and update requests

/**
 * Maps a file name, path, or URL to the MIME type passed to Tika and stored
 * in the Solr `content_type` field.
 *
 * The extension is compared case-insensitively, so `REPORT.PDF` is treated
 * the same as `report.pdf`.
 *
 * @param {string} file_name - File name, path, or URL whose last segment
 *   carries the extension.
 * @returns {string} `text/html` for `.htm`/`.html`, `application/pdf` for
 *   `.pdf`, and `text/plain` for everything else (including `.md`, `.txt`,
 *   and names without an extension).
 */
function extToMime(file_name) {
    // path.extname preserves case; normalize so upper-case extensions match.
    switch (path.extname(file_name).toLowerCase()) {
        case '.htm':
        case '.html':
            return 'text/html';
        case '.pdf':
            return 'application/pdf';
        case '.md':
        case '.txt':
        default:
            return 'text/plain';
    }
}

/**
 * Builds a predicate that tests whether a file object has the given
 * extension. Used as the condition argument of `gulpif`.
 *
 * @param {string} extname - Extension to match, including the leading dot
 *   (for example `.pdf`).
 * @returns {function({extname: string}): boolean} Predicate returning `true`
 *   when `file.extname` is exactly `extname`.
 */
function isFileExt(extname) {
    return (file) => file.extname === extname;
}

/**
 * Streams a local file through the Tika server and writes the result,
 * requested as `text/plain`, to `output.txt` in the current working directory.
 *
 * @param {string} file - Path of the file to send to Tika.
 * @returns {Promise<void>} Settles when `client.pipe` settles.
 */
async function tikaReadPdf(file) {
    const client = new TikaClient({ host: tikaUrl });
    const fileReader = fs.createReadStream(file);
    // The last argument is presumably a file-name hint for Tika (confirm
    // against the tika-js docs); pass the path that is being read.
    await client.pipe(fileReader, fs.createWriteStream('output.txt'), 'text/plain', file);
}

/**
 * Gulp stream task (work in progress): reads the PDF files matched under
 * `public/Russell_County_IDA` and hands each one to `tikaReadPdf`.
 *
 * Every file is passed downstream unchanged. `tikaReadPdf` always writes to
 * the same `output.txt`, so each processed PDF overwrites the previous output.
 *
 * @returns {NodeJS.ReadWriteStream} The gulp stream, returned so gulp can
 *   detect when the task has finished.
 */
function index() {
    return gulp.src([
        // NOTE(review): `**.pdf` is probably not a recursive match; if PDFs in
        // sub-directories should be included this likely needs `**/*.pdf`.
        "public/Russell_County_IDA/**.pdf",
        //['.txt', '.html', '.htm', '.md', '.pdf']
    ])
    .pipe(gulpif(isFileExt(".pdf"), through2.obj(function (file, _encoding, done) {
        // Wait for Tika before releasing the file so PDFs are handled one at a time.
        tikaReadPdf(file.path).then(() => done(null, file), done);
    })));
}

/**
 * Downloads one document, extracts its text with Tika, and indexes the
 * result in Solr.
 *
 * Steps:
 *   1. Download `url` to the fixed temp file `tempfile`.
 *   2. Compute the SHA-256 checksum of the downloaded bytes.
 *   3. Look up the `sha256sum` stored in Solr for this URL and return early
 *      when it equals the fresh checksum.
 *   4. Pipe the file through Tika into `output.txt`, reduce the output to
 *      whitespace-normalized text, and post the document to Solr.
 *
 * Any error is caught and logged (message only) rather than rethrown, so the
 * returned promise does not reject.
 *
 * @param {string} url - URL of the document; also used as the Solr `id`.
 * @param {string} solrUrl - Base URL of the Solr core used for the checksum
 *   lookup. The indexing request itself goes through `indexDocumentInSolr`.
 * @returns {Promise<void>}
 */
async function extractAndIndexWithTika(url, solrUrl) {
    try {
        const tempFilePath = 'tempfile';
        // NOTE(review): downloadFile is not defined in the visible part of
        // this file; confirm where it is provided.
        await downloadFile(url, tempFilePath);
        const fileContent = fs.readFileSync(tempFilePath);
        const checksum = crypto.createHash('sha256').update(fileContent).digest('hex');

        // Query Solr to check if the file is already indexed.
        // The id is sent as a quoted phrase, so escape backslashes and quotes,
        // and let axios URL-encode the query instead of splicing it into the URL.
        const escapedId = url.replace(/(["\\])/g, '\\$1');
        const solrChecksumResponse = await axios.get(`${solrUrl}/select`, {
            params: {
                q: `id:"${escapedId}"`,
                fl: 'sha256sum',
            },
        });
        const solrChecksum = solrChecksumResponse.data.response.docs[0]?.sha256sum;

        if (solrChecksum && solrChecksum === checksum) {
            console.log(`File ${url} hasn't changed. Skipping.`);
            return;
        }

        // Make request to Tika
        console.log(`Processing ${url}`);
        const client = new TikaClient({ host: tikaUrl });
        const version = await client.getVersion();
        console.info(`version: ${version}`);
        console.info(extToMime(url), await client.getContent(fs.createReadStream(tempFilePath), extToMime(url), path.basename(url)));
        await client.pipe(fs.createReadStream(tempFilePath), fs.createWriteStream('output.txt'), 'text/plain', tempFilePath);

        // Read back what Tika wrote, strip any markup, and normalize whitespace.
        const fileData = fs.readFileSync('output.txt');
        const textContent = sanitizeIndexData(extractTextFromHtml(fileData));

        // Create Solr document
        const solrDocument = {
            id: url, // The document URL doubles as the unique identifier
            text: textContent, // Extracted text content
            sha256sum: checksum, // Checksum used by the "unchanged" check above
            url: url,
            content_type: extToMime(url),
            // Add additional fields as needed (e.g., title, author, etc.)
        };

        // Send document to Solr for indexing
        console.log(`Indexing ${url}`);
        await indexDocumentInSolr(solrDocument);

        console.log('Document indexed successfully:', solrDocument.id);

    } catch (error) {
        console.error('Error extracting text with Tika:', error.message);
    }
}

/**
 * Returns the text content of the `<body>` of the given markup, with leading
 * and trailing whitespace removed.
 *
 * @param {*} html - Markup handed to `cheerio.load`.
 * @returns {string} Trimmed body text.
 */
function extractTextFromHtml(html) {
    const query = cheerio.load(html);
    return query('body').text().trim();
}

/**
 * Posts one document to the Solr core's `/update/json/docs` endpoint and
 * commits immediately.
 *
 * @param {object} document - Solr document to index.
 * @returns {Promise<void>}
 * @throws {Error} With the message `Error indexing document in Solr: …` when
 *   the request fails; the underlying error is attached as `cause`.
 */
async function indexDocumentInSolr(document) {
    try {
        await axios.post(solrUrl + '/update/json/docs', document, {
            params: {
                commit: true, // Commit changes immediately
            },
        });
    } catch (error) {
        // Keep the original error reachable for callers that need more than the message.
        throw new Error('Error indexing document in Solr: ' + error.message, { cause: error });
    }
}

/**
 * Normalizes whitespace in extracted text before it is indexed.
 *
 * Every run of whitespace (spaces, tabs, newlines, …) becomes a single
 * space, and leading/trailing whitespace is removed.
 *
 * @param {string} data - Text to clean up.
 * @returns {string} The whitespace-normalized text.
 */
function sanitizeIndexData(data) {
    // Collapsing whole runs leaves no doubled spaces behind, so one pass is enough.
    const collapsed = data.replace(/\s+/g, ' ');
    return collapsed.trim();
}

/**
 * Deletes every document from the Solr core by posting a `*:*` delete query
 * together with a commit to the `/update` endpoint.
 *
 * Failures are logged (message only) and not rethrown.
 *
 * @returns {Promise<void>}
 */
async function clearSolrIndex() {
    try {
        // Send delete query to Solr to delete all documents
        const response = await axios.post(solrUrl + '/update', {
            delete: {
                query: '*:*'
            },
            commit: {}
        }, {
            headers: {
                'Content-Type': 'application/json'
            }
        });

        // responseHeader.status is a status code, not the number of deleted
        // documents, so report it as such.
        console.log('Solr index cleared (status ' + response.data.responseHeader.status + ')');
    } catch (error) {
        console.error('Error clearing Solr index:', error.message);
    }
}

// Gulp task `index:clear`: removes all documents from the Solr search index.
const index_clear = async () => {
    await clearSolrIndex();
};
index_clear.displayName = 'index:clear';
gulp.task(index_clear);

// Gulp task `index:index`: runs every entry of `filesToIndex` through Tika
// and indexes the result in Solr, one document at a time.
// NOTE(review): `filesToIndex` is not defined in the visible part of this
// file; it is assumed to be an array of paths relative to `baseUrl`.
const index_index = async () => {
    for (const line of filesToIndex) {
        const documentUrl = baseUrl + line;
        await extractAndIndexWithTika(documentUrl, solrUrl);
    }
};
index_index.displayName = 'index:index';
gulp.task(index_index);

// Define index tasks
// `index:reindex` runs `index:clear` and then `index:index`, in that order.
gulp.task('index:reindex', gulp.series('index:clear', 'index:index'));
// `index` runs `index:index` only, without clearing the index first.
gulp.task('index', gulp.series('index:index'));
// Define a default task (optional); it runs the `index` task.
gulp.task('default', gulp.series('index'));