// nm3clol-express-app/index/example_tika_upload.js

const axios = require('axios');
const fs = require('fs');
const cheerio = require('cheerio');

// Read the list of files from the text file, one entry per line.
// Splitting on /\r?\n/ and trimming keeps CRLF-terminated lists working: a
// stray '\r' would otherwise become part of the extension and every entry
// would be rejected by the filter below. Blank lines are dropped.
const fileLines = fs.readFileSync('file_list.txt', 'utf8')
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

// Filter the list to include only files with certain file extensions
// (compared case-insensitively).
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf']; // Add more extensions as needed
const filesToIndex = fileLines.filter((line) => {
    // For a line without any '.', lastIndexOf() is -1 and substring() yields
    // the whole line, which never matches an allowed extension.
    const extension = line.substring(line.lastIndexOf('.')).toLowerCase();
    return allowedExtensions.includes(extension);
});

/**
 * Stream the resource at `url` into the file at `filePath`.
 *
 * @param {string} url - Address to fetch with an HTTP GET request.
 * @param {string} filePath - Destination path; created or overwritten.
 * @returns {Promise<void>} Resolves once the write stream has finished.
 *     Rejects if the request fails, if the response stream errors, or if
 *     writing to `filePath` fails.
 */
async function downloadFile(url, filePath) {
    // Issue the request before opening the destination: when the request
    // fails, no file handle has been opened and no empty file is left behind.
    const response = await axios({
        url,
        method: 'GET',
        responseType: 'stream'
    });

    return new Promise((resolve, reject) => {
        const writer = fs.createWriteStream(filePath);

        writer.on('finish', () => resolve());
        writer.on('error', (error) => {
            // Stop reading the response once the destination is unusable.
            response.data.destroy();
            reject(error);
        });

        // pipe() does not forward errors from the source stream, so without
        // this listener an aborted response would leave the promise pending.
        response.data.on('error', (error) => {
            writer.destroy();
            reject(error);
        });

        response.data.pipe(writer);
    });
}

/**
 * Pick the MIME type to declare for a document, based on the file extension
 * of its URL (query string and fragment ignored, case-insensitive).
 *
 * @param {string} url - URL of the document.
 * @returns {string} MIME type; 'application/octet-stream' when the extension
 *     is not recognised.
 */
function guessContentType(url) {
    const pathOnly = url.split(/[?#]/)[0];
    const extension = pathOnly.substring(pathOnly.lastIndexOf('.')).toLowerCase();

    switch (extension) {
        case '.pdf':
            return 'application/pdf';
        case '.html':
        case '.htm':
            return 'text/html';
        case '.md':
            return 'text/markdown';
        case '.txt':
            return 'text/plain';
        default:
            return 'application/octet-stream';
    }
}

/**
 * Download one document, send it to Tika, and index the result in Solr.
 *
 * The document is staged in a temporary file in the current working
 * directory; the file is removed again whether or not indexing succeeds.
 * Failures are logged and swallowed, so the returned promise does not reject
 * and a batch of documents can continue past a bad one.
 *
 * @param {string} url - URL of the document; also used as the Solr document id.
 * @param {string} solrUrl - Solr URL forwarded to Tika in the
 *     'X-Tika-SolrUrl' request header.
 * @returns {Promise<void>}
 */
async function extractAndIndexWithTika(url, solrUrl) {
    // A per-call name keeps overlapping calls from overwriting each other's
    // download, which a single shared path would allow.
    const uniqueSuffix = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
    const tempFilePath = `tempfile-${uniqueSuffix}`;
    const contentType = guessContentType(url);

    try {
        console.log(`Downloading ${url}`);
        await downloadFile(url, tempFilePath);
        console.log(`Downloaded ${url}.`);

        // Read file contents
        const fileData = fs.readFileSync(tempFilePath);

        // Make request to Tika, declaring the type that matches the file
        // rather than always claiming PDF.
        const response = await axios.put(tikaUrl, fileData, {
            headers: {
                'Content-Type': contentType,
                'X-Tika-Output-Format': 'solr',
                'X-Tika-SolrUrl': solrUrl
            }
        });

        console.log('Tika response:', response.data);

        // The Tika response body is treated as markup: pull out the body
        // text and normalise its whitespace.
        const textContent = sanitizeIndexData(extractTextFromHtml(response.data));

        // Create Solr document
        const solrDocument = {
            id: url, // The document URL doubles as the unique identifier
            text: textContent, // Extracted text content
            html: response.data,
            url: url,
            content_length: textContent.length,
            content_type: contentType,
            // Add additional fields as needed (e.g., title, author, etc.)
        };

        // Send document to Solr for indexing
        await indexDocumentInSolr(solrDocument);

        console.log('Document indexed successfully:', solrDocument.id);
    } catch (error) {
        console.error('Error extracting text with Tika:', error.message);
    } finally {
        // Best-effort cleanup; `force` ignores a file that was never created.
        try {
            fs.rmSync(tempFilePath, { force: true });
        } catch (cleanupError) {
            console.error('Error removing temporary file:', cleanupError.message);
        }
    }
}

/**
 * Return the text inside the `body` element of an HTML string, with leading
 * and trailing whitespace removed.
 *
 * @param {string} html - Markup to load with Cheerio.
 * @returns {string} Trimmed text content of `body`.
 */
function extractTextFromHtml(html) {
    const select = cheerio.load(html);
    const bodyText = select('body').text();
    return bodyText.trim();
}

/**
 * Post a single document to Solr's JSON document update endpoint and commit
 * immediately.
 *
 * @param {object} document - Document to index; sent as the request body.
 * @returns {Promise<void>}
 * @throws {Error} When the request fails. The message is prefixed with
 *     'Error indexing document in Solr: ' and the original error is kept on
 *     `cause` so its stack and response details are not lost.
 */
async function indexDocumentInSolr(document) {
    try {
        await axios.post(solrUrl + '/update/json/docs', document, {
            params: {
                commit: true, // Commit changes immediately
            },
        });
    } catch (error) {
        throw new Error(`Error indexing document in Solr: ${error.message}`, { cause: error });
    }
}

/**
 * Normalise whitespace in a string: every run of whitespace becomes a single
 * space, and leading/trailing whitespace is removed.
 *
 * @param {string} data - Text to normalise.
 * @returns {string} The words of `data` separated by single spaces.
 */
function sanitizeIndexData(data) {
    // Splitting on whitespace runs leaves empty strings only where the input
    // started or ended with whitespace; dropping them is the trim step.
    const words = data.split(/\s+/).filter((word) => word !== '');
    return words.join(' ');
}

/**
 * Delete every document from the Solr index and commit.
 *
 * Best-effort: a failure is logged and swallowed, so the returned promise
 * does not reject.
 *
 * @returns {Promise<void>}
 */
async function clearSolrIndex() {
    try {
        // Send delete query to Solr to delete all documents
        const response = await axios.post(solrUrl + '/update', {
            delete: {
                query: '*:*'
            },
            commit: {}
        }, {
            headers: {
                'Content-Type': 'application/json'
            }
        });

        // responseHeader.status is the status code of the request, not a
        // count of deleted documents, so report it as a status.
        console.log(`Solr index cleared (status ${response.data.responseHeader.status})`);
    } catch (error) {
        console.error('Error clearing Solr index:', error.message);
    }
}

// Example usage
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998/tika'; // URL of the Tika instance
const baseUrl = 'https://russell-county-archives.daball.me'; // Base URL prepended to each entry of the file list
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance

/**
 * Clear the Solr index, then download and index every listed file in order.
 *
 * @returns {Promise<void>}
 */
async function runIndexing() {
    // Wait for the delete-all to complete before indexing; started without
    // awaiting, it could run after some documents were already added and
    // remove them again.
    await clearSolrIndex();

    // One document at a time: starting every download at once would issue an
    // unbounded number of simultaneous requests.
    for (const line of filesToIndex) {
        const documentUrl = baseUrl + line;
        await extractAndIndexWithTika(documentUrl, solrUrl);
    }
}

runIndexing().catch((error) => {
    console.error('Indexing run failed:', error);
    process.exitCode = 1;
});