const gulp = require('gulp');
//const shell = require('gulp-shell');
const axios = require('axios');
const fs = require('fs');
const os = require('os');
const path = require('path');
const crypto = require('crypto');
const cheerio = require('cheerio');
const SftpClient = require('ssh2-sftp-client');
const { TikaClient } = require('tika-js');

const baseUrl = 'https://russell-county-archives.daball.me'; // Base URL of the documents to download and index
const sftpBasePath = 'david@caddy.services.cleveland.daball.me:/srv/www/russell-county-archives.daball.me/archives'; // SFTP path to the same content
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // URL of the Tika instance
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance

// Read the list of files from the text file
const fileLines = fs.readFileSync('file_list.txt', 'utf8').split('\n');

// Filter the list to include only files with certain file extensions
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf'];
const filesToIndex = fileLines.filter(line => {
  const extension = line.substring(line.lastIndexOf('.')).toLowerCase();
  return allowedExtensions.includes(extension);
});

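// file_list.txt is assumed to hold one URL path per line, relative to baseUrl
// (the index task below builds each document URL as baseUrl + line), e.g.
// (hypothetical entries):
//   /archives/deed-book-1.pdf
//   /archives/court-minutes.html
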
function extToMime(file_name) {
  switch (path.extname(file_name)) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
    default:
      return 'text/plain';
  }
}

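// e.g. extToMime('deed-book-1.pdf') returns 'application/pdf' (hypothetical file name)
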
const readSshRsaKey = (keyFilePath) => {
  try {
    // Read the contents of the SSH RSA key file
    const key = fs.readFileSync(keyFilePath, 'utf8');
    return key.trim(); // Trim whitespace from the key
  } catch (error) {
    console.error('Error reading SSH RSA key:', error);
    return null;
  }
};

// Load the default private key (~/.ssh/id_rsa)
const defaultKeyFilePath = path.join(os.homedir(), '.ssh', 'id_rsa');
const sshRsaKey = readSshRsaKey(defaultKeyFilePath);

// Unused alternative: shell out to rsync instead of SFTP.
// (Would need: const { exec } = require('child_process');)
// const downloadFileWithRsync = (sourceUrl, destinationPath) => {
//   return new Promise((resolve, reject) => {
//     const rsyncCommand = `rsync -av --progress "${sourceUrl}" "${destinationPath}"`;
//     exec(rsyncCommand, (error, stdout, stderr) => {
//       if (error) {
//         console.error('Error downloading file with rsync:', stderr);
//         reject(error);
//       } else {
//         console.log('File downloaded successfully:', stdout);
//         resolve();
//       }
//     });
//   });
// };

const parseSftpUrl = (sftpUrl) => {
  const regex = /^(?<username>[^@]+)@(?<host>[^:]+):(?<path>.+)$/;
  const match = sftpUrl.match(regex);
  if (match) {
    return {
      username: match.groups.username,
      host: match.groups.host,
      path: match.groups.path
    };
  } else {
    throw new Error('Invalid SFTP URL format');
  }
};

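// For example, parseSftpUrl(sftpBasePath) yields:
//   { username: 'david',
//     host: 'caddy.services.cleveland.daball.me',
//     path: '/srv/www/russell-county-archives.daball.me/archives' }
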
const downloadFileWithSftp = async (remotePath, localPath, options = {}) => {
  const sftp = new SftpClient();
  try {
    // Connect to the SFTP server
    const sftpUrlParts = parseSftpUrl(remotePath);

    await sftp.connect({
      host: sftpUrlParts.host,
      username: sftpUrlParts.username,
      privateKey: sshRsaKey,
    });

    // Download the file; ssh2-sftp-client's get() resolves once the
    // destination stream has been fully written, so it must be awaited
    // before the connection is closed in the finally block below
    await sftp.get(sftpUrlParts.path, fs.createWriteStream(localPath, { start: 0 }));
    //console.log('File downloaded successfully');
  } catch (error) {
    console.error('Error downloading file:', error);
  } finally {
    // Disconnect from the SFTP server
    await sftp.end();
  }
};

async function downloadFile(url, filePath) {
  // if (url.startsWith(baseUrl)) {
  //   //downloadFileWithRsync(url.replace(baseUrl, sftpBasePath), filePath);
  //   const sftpUrl = url.replace(baseUrl, sftpBasePath);
  //   console.log(`Downloading: ${sftpUrl}`);
  //   downloadFileWithSftp(sftpUrl, filePath);
  //   console.log(`Download complete: ${sftpUrl} => ${filePath}`);
  // }
  // else {
  console.log(`Downloading: ${url}`);
  const writer = fs.createWriteStream(filePath, { start: 0 });

  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream'
  });

  response.data.pipe(writer);

  // Resolve (and report completion) only after the file has been fully
  // flushed to disk
  return new Promise((resolve, reject) => {
    writer.on('finish', () => {
      console.log(`Download complete: ${url} => ${filePath}`);
      resolve();
    });
    writer.on('error', reject);
  });
  // }
}

// Fetch the checksum previously stored in Solr for this URL (undefined if
// the document was never indexed). Note: currently unused; the same query
// is performed inline in extractAndIndexWithTika below.
async function getSolrIndexedFileChecksum(url) {
  const response = await axios.get(`${solrUrl}/select?q=id:"${url}"&fl=sha256sum`);
  return response.data.response.docs[0]?.sha256sum;
}

async function extractAndIndexWithTika(url, solrUrl) {
  try {
    const tempFilePath = 'tempfile';
    await downloadFile(url, tempFilePath);
    const fileContent = fs.readFileSync(tempFilePath);
    const checksum = crypto.createHash('sha256').update(fileContent).digest('hex');

    // Query Solr to check if the file is already indexed
    const solrChecksumResponse = await axios.get(`${solrUrl}/select?q=id:"${url}"&fl=sha256sum`);
    const solrChecksum = solrChecksumResponse.data.response.docs[0]?.sha256sum;

    if (solrChecksum && solrChecksum === checksum) {
      console.log(`File ${url} hasn't changed. Skipping.`);
      return;
    }

    // Make request to Tika
    console.log(`Processing ${url}`);
    // const response = await axios.post(tikaUrl + '/form', formData, {
    //   headers: {
    //     // 'Content-Type': 'application/octet-stream',
    //     //'Content-Type': extToMime(url),
    //     //'Content-Length': fs.
    //     'Content-Type': 'multipart/form-data',
    //     'X-Tika-Output-Format': 'solr',
    //     //'X-Tika-SolrUrl': solrUrl
    //   },
    //   timeout: 40000000
    // });
    const client = new TikaClient({ host: tikaUrl });
    const version = await client.getVersion();
    console.info(`version: ${version}`);
    console.info(extToMime(url), await client.getContent(fs.createReadStream(tempFilePath), extToMime(url), path.basename(url)));
    await client.pipe(fs.createReadStream(tempFilePath), fs.createWriteStream('output.txt'), 'text/plain', tempFilePath);

    //console.log('Tika response:', fs.readFileSync('output.txt'));
    const fileData = fs.readFileSync('output.txt', 'utf8');
    //const contentLength = await fs.stat(tempFilePath).size;

    // Strip any markup from Tika's output and normalize whitespace
    const textContent = sanitizeIndexData(extractTextFromHtml(fileData));

    // Create Solr document
    const solrDocument = {
      id: url, // Unique identifier for the document
      text: textContent, // Add the extracted text content
      sha256sum: checksum, // Add the checksum
      //html: response.data,
      url: url,
      //content_length: contentLength,
      content_type: extToMime(url),
      // Add additional fields as needed (e.g., title, author, etc.)
    };

    // Send document to Solr for indexing
    console.log(`Indexing ${url}`);
    await indexDocumentInSolr(solrDocument);

    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  }
}

function extractTextFromHtml(html) {
  // Parse HTML using Cheerio
  const $ = cheerio.load(html);

  // Extract text content from HTML
  const textContent = $('body').text().trim();

  return textContent;
}

async function indexDocumentInSolr(document) {
  try {
    // Send the document to Solr's JSON document update endpoint
    await axios.post(solrUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}

function sanitizeIndexData(data) {
  // Convert all runs of whitespace characters to single spaces
  let sanitizedData = data.replace(/\s+/g, ' ');

  // Collapse any remaining double spaces (defensive; the replace above
  // already handles this)
  while (sanitizedData !== (sanitizedData = sanitizedData.replace(/  /g, ' '))) {}

  return sanitizedData.trim(); // Trim leading and trailing spaces
}

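// e.g. sanitizeIndexData('  Deed\n\tBook   1  ') === 'Deed Book 1'
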
async function clearSolrIndex() {
  try {
    // Send delete query to Solr to delete all documents
    const response = await axios.post(solrUrl + '/update', {
      delete: {
        query: '*:*'
      },
      commit: {}
    }, {
      headers: {
        'Content-Type': 'application/json'
      }
    });

    // responseHeader.status is a return code (0 on success), not a count
    console.log('Cleared index; Solr returned status ' + response.data.responseHeader.status);
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}

// Clears the Solr search index
const index_clear = async function() {
  await clearSolrIndex();
};
index_clear.displayName = 'index:clear';
gulp.task(index_clear);

// Reindexes the Solr search index with plaintext extracted by Tika
const index_index = async function() {
  for (let l = 0; l < filesToIndex.length; l++) {
    let line = filesToIndex[l];
    let documentUrl = baseUrl + line;
    await extractAndIndexWithTika(documentUrl, solrUrl);
  }
};
index_index.displayName = 'index:index';
gulp.task(index_index);

// Define index tasks
gulp.task('index:reindex', gulp.series('index:clear', 'index:index'));
gulp.task('index', gulp.series('index:index'));
// Define a default task (optional)
gulp.task('default', gulp.series('index'));
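
// Run with the gulp CLI, for example:
//   npx gulp index:clear     (wipe the Solr index)
//   npx gulp index:reindex   (clear, then rebuild)
//   npx gulp                 (default task, same as `npx gulp index`)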