const axios = require('axios');
const fs = require('fs');
const os = require('os');
const path = require('path');
const cheerio = require('cheerio');

// --- Configuration -------------------------------------------------------
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998/tika'; // Tika extraction endpoint
const baseUrl = 'https://russell-county-archives.daball.me'; // Base URL the file list is relative to
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // Solr core endpoint

// Read the list of files to index. Trim each line (handles CRLF file lists,
// whose trailing '\r' would otherwise corrupt the built URLs) and drop blanks.
const fileLines = fs.readFileSync('file_list.txt', 'utf8')
  .split('\n')
  .map((line) => line.trim())
  .filter((line) => line.length > 0);

// Only index files whose extension we know how to handle.
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf']; // Add more extensions as needed
const filesToIndex = fileLines.filter((line) => {
  const extension = line.substring(line.lastIndexOf('.')).toLowerCase();
  return allowedExtensions.includes(extension);
});

// MIME type per allowed extension, sent to Tika as Content-Type.
// BUG FIX: the original hard-coded 'application/pdf' for every file type.
const extensionToMime = {
  '.txt': 'text/plain',
  '.html': 'text/html',
  '.htm': 'text/html',
  '.md': 'text/markdown',
  '.pdf': 'application/pdf',
};

/**
 * Resolve the MIME type for a file path/URL from its extension.
 * @param {string} filePath - Path or URL ending in a file extension.
 * @returns {string} MIME type, or 'application/octet-stream' if unknown.
 */
function mimeTypeFor(filePath) {
  const extension = filePath.substring(filePath.lastIndexOf('.')).toLowerCase();
  return extensionToMime[extension] ?? 'application/octet-stream';
}

/**
 * Stream-download `url` to `filePath`.
 * @param {string} url - URL to fetch.
 * @param {string} filePath - Destination path on disk.
 * @returns {Promise<void>} Resolves when the file is fully written.
 */
async function downloadFile(url, filePath) {
  const writer = fs.createWriteStream(filePath);
  const response = await axios({ url, method: 'GET', responseType: 'stream' });
  response.data.pipe(writer);
  return new Promise((resolve, reject) => {
    writer.on('finish', resolve);
    writer.on('error', reject);
    // BUG FIX: without this, a mid-download network error left the
    // promise pending forever (writer never emits 'finish' or 'error').
    response.data.on('error', reject);
  });
}

/**
 * Download a document, extract its text via a Tika server, and index the
 * result into Solr. Errors are logged, not rethrown (best-effort per file).
 * @param {string} url - Absolute URL of the document to index.
 * @param {string} solrUrl - Solr core URL forwarded to Tika via X-Tika-SolrUrl.
 * @returns {Promise<void>}
 */
async function extractAndIndexWithTika(url, solrUrl) {
  // BUG FIX: a unique temp path per invocation. The original shared a single
  // hard-coded 'tempfile', so concurrent calls clobbered each other's data.
  const tempFilePath = path.join(
    os.tmpdir(),
    `tika-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`
  );
  try {
    console.log(`Downloading ${url}`);
    await downloadFile(url, tempFilePath);
    console.log(`Downloaded ${url}.`);

    // Read file contents
    const fileData = fs.readFileSync(tempFilePath);

    // Make request to Tika
    const response = await axios.put(tikaUrl, fileData, {
      headers: {
        'Content-Type': mimeTypeFor(url),
        'X-Tika-Output-Format': 'solr',
        'X-Tika-SolrUrl': solrUrl,
      },
    });
    console.log('Tika response:', response.data);

    // Strip HTML markup from Tika's response and normalize whitespace.
    const textContent = sanitizeIndexData(extractTextFromHtml(response.data));

    // Create Solr document.
    const solrDocument = {
      // BUG FIX: was `documentUrl`, which is not defined in this function's
      // scope (it is block-scoped to the caller's loop) — a ReferenceError.
      id: url,
      text: textContent,
      html: response.data,
      url: url,
      content_length: textContent.length,
      content_type: mimeTypeFor(url),
      // Add additional fields as needed (e.g., title, author, etc.)
    };

    // Send document to Solr for indexing
    await indexDocumentInSolr(solrDocument);
    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  } finally {
    // Best-effort cleanup of the temporary download.
    try {
      fs.unlinkSync(tempFilePath);
    } catch {
      /* temp file may not exist if the download failed — ignore */
    }
  }
}

/**
 * Extract the visible text content from an HTML string.
 * @param {string} html - HTML markup.
 * @returns {string} Trimmed text content of the <body>.
 */
function extractTextFromHtml(html) {
  const $ = cheerio.load(html);
  return $('body').text().trim();
}

/**
 * Index a single document into Solr, committing immediately.
 * @param {object} document - Solr document (id, text, html, url, ...).
 * @throws {Error} Wraps any transport/Solr error with context.
 */
async function indexDocumentInSolr(document) {
  try {
    await axios.post(solrUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}

/**
 * Normalize whitespace for indexing: collapse every run of whitespace
 * (tabs, newlines, multiple spaces) into a single space and trim.
 * @param {string} data - Raw extracted text.
 * @returns {string} Sanitized text.
 */
function sanitizeIndexData(data) {
  // BUG FIX: the original followed this replace with a while-loop replacing
  // a single space with a single space — dead code that could never change
  // the string. One pass of /\s+/g already collapses all whitespace runs.
  return data.replace(/\s+/g, ' ').trim();
}

/**
 * Delete every document in the Solr index (delete-by-query *:*) and commit.
 * Errors are logged, not rethrown.
 * @returns {Promise<void>}
 */
async function clearSolrIndex() {
  try {
    const response = await axios.post(
      solrUrl + '/update',
      { delete: { query: '*:*' }, commit: {} },
      { headers: { 'Content-Type': 'application/json' } }
    );
    // BUG FIX: responseHeader.status is a result code (0 = success),
    // not a count of deleted documents, so don't report it as one.
    console.log('Solr index cleared; response status:', response.data.responseHeader.status);
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}

/**
 * Entry point: clear the index, then index every file sequentially.
 */
async function main() {
  // BUG FIX: the original fired clearSolrIndex() without awaiting it, so the
  // delete-all raced against (and could wipe out) the indexing below.
  await clearSolrIndex();

  for (const line of filesToIndex) {
    const documentUrl = baseUrl + line;
    // Sequential on purpose: the original launched every request at once,
    // racing on a shared temp file and hammering the Tika/Solr services.
    await extractAndIndexWithTika(documentUrl, solrUrl);
  }
}

main().catch((error) => {
  console.error('Fatal error:', error.message);
  process.exitCode = 1;
});