// Forked from nm3clol/nm3clol-express-app (144 lines, 4.5 KiB, JavaScript).
const axios = require('axios');
|
|
const fs = require('fs');
|
|
const cheerio = require('cheerio');
|
|
|
|
// Read the list of files from the text file, one entry per line.
// Each line is trimmed so that CRLF line endings or stray whitespace do not
// defeat the extension check below or leak into URLs built from these entries.
const fileLines = fs
  .readFileSync('file_list.txt', 'utf8')
  .split('\n')
  .map((line) => line.trim());

// Filter the list to include only files with certain file extensions
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf']; // Add more extensions as needed
const filesToIndex = fileLines.filter((line) => {
  const dotIndex = line.lastIndexOf('.');
  if (dotIndex === -1) {
    // No extension at all (this also covers blank lines).
    return false;
  }
  // Compare case-insensitively so "REPORT.PDF" is accepted too.
  const extension = line.substring(dotIndex).toLowerCase();
  return allowedExtensions.includes(extension);
});
|
|
|
|
/**
 * Download `url` with a streaming GET request and write the body to `filePath`.
 *
 * The write stream is opened only after the request has succeeded, so a failed
 * request does not leave an open file handle or an empty file behind.
 *
 * @param {string} url - Address to fetch.
 * @param {string} filePath - Destination path on the local file system.
 * @returns {Promise<void>} Resolves once the file has been fully written;
 *   rejects if the request, the response stream, or the file write fails.
 */
async function downloadFile(url, filePath) {
  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream'
  });

  return new Promise((resolve, reject) => {
    const source = response.data;
    const writer = fs.createWriteStream(filePath);

    writer.on('finish', resolve);
    writer.on('error', (error) => {
      // Stop pulling data from the network once the file can no longer be written.
      source.destroy();
      reject(error);
    });
    // pipe() does not forward source errors to the destination; without this
    // handler an interrupted download would leave the promise pending forever.
    source.on('error', (error) => {
      writer.destroy();
      reject(error);
    });

    source.pipe(writer);
  });
}
|
|
|
|
/**
 * Pick the MIME type to announce for a document, based on its URL's extension.
 * Anything unrecognised is sent as a generic binary stream.
 *
 * @param {string} url - Document URL.
 * @returns {string} MIME type string.
 */
function guessContentType(url) {
  const lowerUrl = url.toLowerCase();
  if (lowerUrl.endsWith('.pdf')) {
    return 'application/pdf';
  }
  if (lowerUrl.endsWith('.html') || lowerUrl.endsWith('.htm')) {
    return 'text/html';
  }
  if (lowerUrl.endsWith('.txt')) {
    return 'text/plain';
  }
  return 'application/octet-stream';
}

/**
 * Download a document, run it through Tika, and index the extracted text in Solr.
 *
 * Failures are logged and swallowed so that one bad document does not abort
 * the caller's run.
 *
 * @param {string} url - URL of the document; also used as the Solr document id.
 * @param {string} solrUrl - Solr URL forwarded to Tika in the `X-Tika-SolrUrl`
 *   header. Note that the actual indexing call goes through
 *   `indexDocumentInSolr`, which does not receive this argument.
 * @returns {Promise<void>}
 */
async function extractAndIndexWithTika(url, solrUrl) {
  // Unique per call: callers may start several extractions without awaiting
  // them, and a shared file name would let the downloads overwrite each other.
  const uniqueSuffix = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const tempFilePath = `tempfile-${uniqueSuffix}`;

  try {
    console.log(`Downloading ${url}`);
    await downloadFile(url, tempFilePath);
    console.log(`Downloaded ${url}.`);

    // Read file contents
    const fileData = fs.readFileSync(tempFilePath);
    const contentType = guessContentType(url);

    // Make request to Tika
    const response = await axios.put(tikaUrl, fileData, {
      headers: {
        'Content-Type': contentType,
        'X-Tika-Output-Format': 'solr',
        'X-Tika-SolrUrl': solrUrl
      }
    });

    console.log('Tika response:', response.data);

    // The Tika response is handled as markup: take the body text and
    // normalise its whitespace.
    const textContent = sanitizeIndexData(extractTextFromHtml(response.data));

    // Create Solr document
    const solrDocument = {
      id: url, // The document URL doubles as its unique identifier
      text: textContent, // Extracted text content
      html: response.data,
      url: url,
      content_length: textContent.length,
      content_type: contentType,
      // Add additional fields as needed (e.g., title, author, etc.)
    };

    // Send document to Solr for indexing
    await indexDocumentInSolr(solrDocument);

    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  } finally {
    // Always remove the temporary download, whether or not indexing succeeded.
    try {
      fs.unlinkSync(tempFilePath);
    } catch (cleanupError) {
      // ENOENT just means the download never created the file.
      if (cleanupError.code !== 'ENOENT') {
        console.error('Error removing temporary file:', cleanupError.message);
      }
    }
  }
}
|
|
|
|
/**
 * Return the text inside the `<body>` of an HTML string, with leading and
 * trailing whitespace removed.
 *
 * @param {string} html - Markup to parse with Cheerio.
 * @returns {string} Trimmed text content of the body element.
 */
function extractTextFromHtml(html) {
  const select = cheerio.load(html);
  return select('body').text().trim();
}
|
|
|
|
/**
 * POST a single document to the Solr JSON update endpoint built from the
 * module-level `solrUrl`, asking Solr to commit immediately.
 *
 * @param {object} document - Document to index.
 * @returns {Promise<void>}
 * @throws {Error} With a message prefixed by "Error indexing document in Solr: "
 *   when the request fails.
 */
async function indexDocumentInSolr(document) {
  try {
    const endpoint = solrUrl + '/update/json/docs';
    const requestOptions = {
      params: { commit: true }, // Commit changes immediately
    };
    await axios.post(endpoint, document, requestOptions);
  } catch (error) {
    throw new Error(`Error indexing document in Solr: ${error.message}`);
  }
}
|
|
|
|
/**
 * Normalise whitespace in a string: every run of whitespace characters
 * (spaces, tabs, newlines, non-breaking spaces, ...) becomes a single space,
 * and leading/trailing spaces are removed.
 *
 * @param {string} data - Text to normalise.
 * @returns {string} The text with collapsed, trimmed whitespace.
 */
function sanitizeIndexData(data) {
  // `\s+` already collapses each whitespace run to one space, so no
  // follow-up pass for doubled spaces is needed.
  return data.replace(/\s+/g, ' ').trim();
}
|
|
|
|
/**
 * Delete every document from the Solr core at the module-level `solrUrl` by
 * posting a match-all delete query together with a commit.
 *
 * Failures are logged rather than thrown.
 *
 * @returns {Promise<void>}
 */
async function clearSolrIndex() {
  try {
    // Send delete query to Solr to delete all documents
    const response = await axios.post(
      solrUrl + '/update',
      {
        delete: { query: '*:*' },
        commit: {}
      },
      {
        headers: { 'Content-Type': 'application/json' }
      }
    );

    // responseHeader.status is Solr's status code for the request, not a
    // count of deleted documents, so report it as a status.
    console.log('Cleared Solr index (response status ' + response.data.responseHeader.status + ')');
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}
|
|
|
|
// Service endpoints used by this script
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998/tika'; // URL of the Tika instance
const baseUrl = 'https://russell-county-archives.daball.me'; // Base URL prepended to each entry of filesToIndex
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance

/**
 * Rebuild the index: clear Solr, then download and index each listed file.
 *
 * Each step is awaited in turn. Waiting for the clear to finish keeps the
 * delete-all from landing after new documents have been added, and handling
 * one document at a time avoids launching every download at once.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Call the function to clear the Solr index
  await clearSolrIndex();

  for (const line of filesToIndex) {
    const documentUrl = baseUrl + line;
    await extractAndIndexWithTika(documentUrl, solrUrl);
  }
}

main().catch((error) => {
  console.error('Indexing run failed:', error.message);
  process.exitCode = 1;
});