const gulp = require('gulp');
//const shell = require('gulp-shell');
const axios = require('axios');
const fs = require('fs');
const os = require('os');
const path = require('path');
const crypto = require('crypto');
const cheerio = require('cheerio');
const SftpClient = require('ssh2-sftp-client');
const { TikaClient } = require('tika-js');

const baseUrl = 'https://russell-county-archives.daball.me'; // Base URL of the documents to download and index
const sftpBasePath = 'david@caddy.services.cleveland.daball.me:/srv/www/russell-county-archives.daball.me/archives'; // SFTP path to the same content
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // URL of the Tika instance
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance

// Read the list of files from the text file
const fileLines = fs.readFileSync('file_list.txt', 'utf8').split('\n');

// Filter the list to include only files with certain file extensions
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf'];
const filesToIndex = fileLines.filter(line => {
  const extension = line.substring(line.lastIndexOf('.')).toLowerCase();
  return allowedExtensions.includes(extension);
});

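// file_list.txt is assumed to hold one URL path per line, relative to baseUrl
// (the index task below builds each document URL as baseUrl + line), e.g.
// (hypothetical entries):
//   /archives/deed-book-1.pdf
//   /archives/court-minutes.html
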
function extToMime(file_name) {
  switch (path.extname(file_name)) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
    default:
      return 'text/plain';
  }
}

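// e.g. extToMime('deed-book-1.pdf') returns 'application/pdf' (hypothetical file name)
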
const readSshRsaKey = (keyFilePath) => {
  try {
    // Read the contents of the SSH RSA key file
    const key = fs.readFileSync(keyFilePath, 'utf8');
    return key.trim(); // Trim whitespace from the key
  } catch (error) {
    console.error('Error reading SSH RSA key:', error);
    return null;
  }
};

// Load the default private key (~/.ssh/id_rsa)
const defaultKeyFilePath = path.join(os.homedir(), '.ssh', 'id_rsa');
const sshRsaKey = readSshRsaKey(defaultKeyFilePath);

// Unused alternative: shell out to rsync instead of SFTP.
// (Would need: const { exec } = require('child_process');)
// const downloadFileWithRsync = (sourceUrl, destinationPath) => {
//   return new Promise((resolve, reject) => {
//     const rsyncCommand = `rsync -av --progress "${sourceUrl}" "${destinationPath}"`;
//     exec(rsyncCommand, (error, stdout, stderr) => {
//       if (error) {
//         console.error('Error downloading file with rsync:', stderr);
//         reject(error);
//       } else {
//         console.log('File downloaded successfully:', stdout);
//         resolve();
//       }
//     });
//   });
// };

const parseSftpUrl = (sftpUrl) => {
  const regex = /^(?<username>[^@]+)@(?<host>[^:]+):(?<path>.+)$/;
  const match = sftpUrl.match(regex);
  if (match) {
    return {
      username: match.groups.username,
      host: match.groups.host,
      path: match.groups.path
    };
  } else {
    throw new Error('Invalid SFTP URL format');
  }
};

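// For example, parseSftpUrl(sftpBasePath) yields:
//   { username: 'david',
//     host: 'caddy.services.cleveland.daball.me',
//     path: '/srv/www/russell-county-archives.daball.me/archives' }
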
const downloadFileWithSftp = async (remotePath, localPath, options = {}) => {
  const sftp = new SftpClient();
  try {
    // Connect to the SFTP server
    const sftpUrlParts = parseSftpUrl(remotePath);

    await sftp.connect({
      host: sftpUrlParts.host,
      username: sftpUrlParts.username,
      privateKey: sshRsaKey,
    });

    // Download the file; ssh2-sftp-client's get() resolves once the
    // destination stream has been fully written, so it must be awaited
    // before the connection is closed in the finally block below
    await sftp.get(sftpUrlParts.path, fs.createWriteStream(localPath, { start: 0 }));
    //console.log('File downloaded successfully');
  } catch (error) {
    console.error('Error downloading file:', error);
  } finally {
    // Disconnect from the SFTP server
    await sftp.end();
  }
};

async function downloadFile(url, filePath) {
  // if (url.startsWith(baseUrl)) {
  //   //downloadFileWithRsync(url.replace(baseUrl, sftpBasePath), filePath);
  //   const sftpUrl = url.replace(baseUrl, sftpBasePath);
  //   console.log(`Downloading: ${sftpUrl}`);
  //   downloadFileWithSftp(sftpUrl, filePath);
  //   console.log(`Download complete: ${sftpUrl} => ${filePath}`);
  // }
  // else {
  console.log(`Downloading: ${url}`);
  const writer = fs.createWriteStream(filePath, { start: 0 });

  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream'
  });

  response.data.pipe(writer);

  // Resolve (and report completion) only after the file has been fully
  // flushed to disk
  return new Promise((resolve, reject) => {
    writer.on('finish', () => {
      console.log(`Download complete: ${url} => ${filePath}`);
      resolve();
    });
    writer.on('error', reject);
  });
  // }
}

// Fetch the checksum previously stored in Solr for this URL (undefined if
// the document was never indexed). Note: currently unused; the same query
// is performed inline in extractAndIndexWithTika below.
async function getSolrIndexedFileChecksum(url) {
  const response = await axios.get(`${solrUrl}/select?q=id:"${url}"&fl=sha256sum`);
  return response.data.response.docs[0]?.sha256sum;
}

async function extractAndIndexWithTika(url, solrUrl) {
  try {
    const tempFilePath = 'tempfile';
    await downloadFile(url, tempFilePath);
    const fileContent = fs.readFileSync(tempFilePath);
    const checksum = crypto.createHash('sha256').update(fileContent).digest('hex');

    // Query Solr to check if the file is already indexed
    const solrChecksumResponse = await axios.get(`${solrUrl}/select?q=id:"${url}"&fl=sha256sum`);
    const solrChecksum = solrChecksumResponse.data.response.docs[0]?.sha256sum;

    if (solrChecksum && solrChecksum === checksum) {
      console.log(`File ${url} hasn't changed. Skipping.`);
      return;
    }

    // Make request to Tika
    console.log(`Processing ${url}`);
    // const response = await axios.post(tikaUrl + '/form', formData, {
    //   headers: {
    //     // 'Content-Type': 'application/octet-stream',
    //     //'Content-Type': extToMime(url),
    //     //'Content-Length': fs.
    //     'Content-Type': 'multipart/form-data',
    //     'X-Tika-Output-Format': 'solr',
    //     //'X-Tika-SolrUrl': solrUrl
    //   },
    //   timeout: 40000000
    // });
    const client = new TikaClient({ host: tikaUrl });
    const version = await client.getVersion();
    console.info(`version: ${version}`);
    console.info(extToMime(url), await client.getContent(fs.createReadStream(tempFilePath), extToMime(url), path.basename(url)));
    await client.pipe(fs.createReadStream(tempFilePath), fs.createWriteStream('output.txt'), 'text/plain', tempFilePath);

    //console.log('Tika response:', fs.readFileSync('output.txt'));
    const fileData = fs.readFileSync('output.txt', 'utf8');
    //const contentLength = await fs.stat(tempFilePath).size;

    // Strip any markup from Tika's output and normalize whitespace
    const textContent = sanitizeIndexData(extractTextFromHtml(fileData));

    // Create Solr document
    const solrDocument = {
      id: url, // Unique identifier for the document
      text: textContent, // Add the extracted text content
      sha256sum: checksum, // Add the checksum
      //html: response.data,
      url: url,
      //content_length: contentLength,
      content_type: extToMime(url),
      // Add additional fields as needed (e.g., title, author, etc.)
    };

    // Send document to Solr for indexing
    console.log(`Indexing ${url}`);
    await indexDocumentInSolr(solrDocument);

    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  }
}

function extractTextFromHtml(html) {
  // Parse HTML using Cheerio
  const $ = cheerio.load(html);

  // Extract text content from HTML
  const textContent = $('body').text().trim();

  return textContent;
}

async function indexDocumentInSolr(document) {
  try {
    // Send the document to Solr's JSON document update endpoint
    await axios.post(solrUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}

function sanitizeIndexData(data) {
  // Convert all runs of whitespace characters to single spaces
  let sanitizedData = data.replace(/\s+/g, ' ');

  // Collapse any remaining double spaces (defensive; the replace above
  // already handles this)
  while (sanitizedData !== (sanitizedData = sanitizedData.replace(/  /g, ' '))) {}

  return sanitizedData.trim(); // Trim leading and trailing spaces
}

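// e.g. sanitizeIndexData('  Deed\n\tBook   1  ') === 'Deed Book 1'
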
async function clearSolrIndex() {
  try {
    // Send delete query to Solr to delete all documents
    const response = await axios.post(solrUrl + '/update', {
      delete: {
        query: '*:*'
      },
      commit: {}
    }, {
      headers: {
        'Content-Type': 'application/json'
      }
    });

    // responseHeader.status is a return code (0 on success), not a count
    console.log('Cleared index; Solr returned status ' + response.data.responseHeader.status);
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}

// Clears the Solr search index
const index_clear = async function() {
  await clearSolrIndex();
};
index_clear.displayName = 'index:clear';
gulp.task(index_clear);

// Reindexes the Solr search index with plaintext extracted by Tika
const index_index = async function() {
  for (let l = 0; l < filesToIndex.length; l++) {
    let line = filesToIndex[l];
    let documentUrl = baseUrl + line;
    await extractAndIndexWithTika(documentUrl, solrUrl);
  }
};
index_index.displayName = 'index:index';
gulp.task(index_index);

// Define index tasks
gulp.task('index:reindex', gulp.series('index:clear', 'index:index'));
gulp.task('index', gulp.series('index:index'));
// Define a default task (optional)
gulp.task('default', gulp.series('index'));
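
// Run with the gulp CLI, for example:
//   npx gulp index:clear     (wipe the Solr index)
//   npx gulp index:reindex   (clear, then rebuild)
//   npx gulp                 (default task, same as `npx gulp index`)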