// nm3clol-express-app/index/example_tika_upload.js
//
// 144 lines
// 4.5 KiB
// JavaScript

const axios = require('axios');
const fs = require('fs');
const cheerio = require('cheerio');
// Read the list of files from the text file (one entry per line).
const fileLines = fs.readFileSync('file_list.txt', 'utf8').split('\n');
// Filter the list to include only files with certain file extensions.
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf']; // Add more extensions as needed
const filesToIndex = fileLines
  // Trim first: a trailing '\r' (CRLF file) or stray whitespace would otherwise
  // become part of the "extension" and of the URL built from the line.
  .map((line) => line.trim())
  .filter((line) => {
    const dotIndex = line.lastIndexOf('.');
    // Blank lines and names without a dot have no extension to match.
    if (dotIndex === -1) {
      return false;
    }
    return allowedExtensions.includes(line.slice(dotIndex).toLowerCase());
  });
/**
 * Stream the response body of a GET request into a local file.
 *
 * @param {string} url - Address to download.
 * @param {string} filePath - Destination path; created or overwritten.
 * @returns {Promise<void>} Resolves once the file stream has finished writing.
 *   Rejects if the request fails, the response stream errors, or the file
 *   cannot be written.
 */
async function downloadFile(url, filePath) {
  // Issue the request before opening the file so a failed request does not
  // leave behind an open write stream and an empty file.
  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream',
  });

  return new Promise((resolve, reject) => {
    const source = response.data;
    const writer = fs.createWriteStream(filePath);

    // pipe() does not forward read-side errors to the destination, so without
    // this handler a broken download would leave the promise pending forever.
    source.on('error', (error) => {
      writer.destroy();
      reject(error);
    });
    writer.on('error', (error) => {
      // Stop pulling data we can no longer store.
      source.destroy();
      reject(error);
    });
    writer.on('finish', resolve);

    source.pipe(writer);
  });
}
/**
 * Download one document, send it to Tika, and index the result in Solr.
 *
 * Failures at any step are logged and swallowed, so the returned promise
 * never rejects.
 *
 * @param {string} url - Address of the document; also used as the Solr id.
 * @param {string} solrUrl - Solr URL passed to Tika in the X-Tika-SolrUrl header.
 * @returns {Promise<void>}
 */
async function extractAndIndexWithTika(url, solrUrl) {
  // Each call gets its own temp file so overlapping calls cannot overwrite
  // each other's download.
  const uniqueSuffix = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const tempFilePath = `tempfile-${uniqueSuffix}`;
  try {
    console.log(`Downloading ${url}`);
    await downloadFile(url, tempFilePath);
    console.log(`Downloaded ${url}.`);
    // Read file contents
    const fileData = await fs.promises.readFile(tempFilePath);
    // Make request to Tika
    // NOTE(review): the content type is hard-coded to PDF although the file
    // list filter also admits text/HTML/Markdown files — confirm this is intended.
    const response = await axios.put(tikaUrl, fileData, {
      headers: {
        'Content-Type': 'application/pdf',
        'X-Tika-Output-Format': 'solr',
        'X-Tika-SolrUrl': solrUrl,
      },
    });
    console.log('Tika response:', response.data);
    // Reduce the Tika response to whitespace-normalized plain text.
    const textContent = sanitizeIndexData(extractTextFromHtml(response.data));
    // Create Solr document
    const solrDocument = {
      // The document URL is the unique identifier. (The previous code referenced
      // a loop-scoped variable that is not visible here and threw on every call.)
      id: url,
      text: textContent, // Extracted text content
      html: response.data,
      url,
      content_length: textContent.length,
      content_type: 'application/pdf',
      // Add additional fields as needed (e.g., title, author, etc.)
    };
    // Send document to Solr for indexing
    await indexDocumentInSolr(solrDocument);
    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  } finally {
    // Best-effort cleanup; a failure here must not turn into a rejection.
    await fs.promises.rm(tempFilePath, { force: true }).catch((cleanupError) => {
      console.error('Error removing temporary file:', cleanupError.message);
    });
  }
}
/**
 * Return the trimmed text of the <body> element of an HTML string.
 *
 * @param {string} html - Markup to parse with Cheerio.
 * @returns {string} Text content of the body, without surrounding whitespace.
 */
function extractTextFromHtml(html) {
  // Load the markup, select the body, and take its text in one chain.
  return cheerio.load(html)('body').text().trim();
}
/**
 * Post a single document to Solr's JSON update endpoint and commit immediately.
 *
 * @param {object} document - Document to index.
 * @returns {Promise<void>}
 * @throws {Error} When the request fails; the original error is kept as `cause`.
 */
async function indexDocumentInSolr(document) {
  try {
    await axios.post(`${solrUrl}/update/json/docs`, document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    // Keep the underlying error reachable instead of reducing it to its message.
    throw new Error('Error indexing document in Solr: ' + error.message, { cause: error });
  }
}
/**
 * Normalize whitespace in text destined for the index.
 *
 * @param {string} data - Text to clean.
 * @returns {string} The text with every run of whitespace replaced by a single
 *   space and leading/trailing whitespace removed.
 */
function sanitizeIndexData(data) {
  // /\s+/g already collapses each whitespace run to one space, so no second
  // pass is needed to remove doubled spaces.
  return data.replace(/\s+/g, ' ').trim();
}
/**
 * Delete every document from the Solr core and commit.
 *
 * Errors are logged and swallowed, so the returned promise never rejects.
 *
 * @returns {Promise<void>}
 */
async function clearSolrIndex() {
  try {
    // Send delete query to Solr to delete all documents
    const response = await axios.post(
      `${solrUrl}/update`,
      {
        delete: {
          query: '*:*',
        },
        commit: {},
      },
      {
        headers: {
          'Content-Type': 'application/json',
        },
      },
    );
    // responseHeader.status is a status code, not a document count, so it is
    // reported as such rather than as "Deleted N documents".
    const status = response.data.responseHeader.status;
    console.log(`Solr index cleared (response status ${status})`);
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}
// Example usage
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998/tika'; // URL of the Tika instance
const baseUrl = 'https://russell-county-archives.daball.me'; // Base URL each listed file path is appended to
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance

/**
 * Clear the Solr index, then download, extract, and index every listed file.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Wait for the delete-all to finish; otherwise it can race with indexing
  // and wipe documents that were just added.
  await clearSolrIndex();
  // Process files one at a time so no promise is left floating and downloads
  // do not overlap.
  for (const line of filesToIndex) {
    await extractAndIndexWithTika(baseUrl + line, solrUrl);
  }
}

main().catch((error) => {
  console.error('Indexing run failed:', error.message);
  process.exitCode = 1;
});