/**
 * Gulp tasks that build a Solr full-text index for an archive of documents.
 *
 * Each path listed in `file_list.txt` is downloaded from `baseUrl`, its text
 * is extracted with Apache Tika, and the result is posted to Solr together
 * with a SHA-256 checksum so unchanged files can be skipped on later runs.
 *
 * Tasks: `index:clear`, `index:index`, `index:reindex`, `index`, `default`.
 */
const gulp = require('gulp');
const axios = require('axios');
const fs = require('fs');
const os = require('os');
const path = require('path');
const crypto = require('crypto');
const cheerio = require('cheerio');
const SftpClient = require('ssh2-sftp-client');
const { TikaClient } = require('tika-js');

// Base URL that every path from file_list.txt is appended to.
const baseUrl = 'https://russell-county-archives.daball.me';
// SSH location (user@host:/path) of the same archive; only used by the SFTP helper.
const sftpBasePath = 'david@caddy.services.cleveland.daball.me:/srv/www/russell-county-archives.daball.me/archives';
// URL of the Tika instance.
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998';
// URL of the Solr core that receives the documents.
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core';

// Scratch files (relative to the working directory) reused for every document.
const tempFilePath = 'tempfile';
const tikaOutputPath = 'output.txt';

// Read the list of files from the text file. Splitting on /\r?\n/ and trimming
// keeps a CRLF-terminated list from leaving a stray '\r' on every entry, which
// would otherwise make the extension filter below reject every line.
const fileLines = fs
  .readFileSync('file_list.txt', 'utf8')
  .split(/\r?\n/)
  .map((line) => line.trim())
  .filter((line) => line !== '');

// Filter the list to include only files with certain file extensions.
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf'];
const filesToIndex = fileLines.filter((line) => {
  const extension = line.substring(line.lastIndexOf('.')).toLowerCase();
  return allowedExtensions.includes(extension);
});

/**
 * Maps a file name (or URL) to a MIME type based on its extension.
 *
 * The extension is compared case-insensitively so that it agrees with the
 * case-insensitive filter used to build `filesToIndex`.
 *
 * @param {string} file_name - File name, path, or URL ending in an extension.
 * @returns {string} 'text/html', 'application/pdf', or 'text/plain' (the default).
 */
function extToMime(file_name) {
  switch (path.extname(file_name).toLowerCase()) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
    default:
      return 'text/plain';
  }
}

/**
 * Reads a private key file and returns its trimmed contents.
 *
 * @param {string} keyFilePath - Path of the key file.
 * @returns {string|null} The key text, or null when the file cannot be read
 *   (the error is logged).
 */
const readSshRsaKey = (keyFilePath) => {
  try {
    const key = fs.readFileSync(keyFilePath, 'utf8');
    return key.trim();
  } catch (error) {
    console.error('Error reading SSH RSA key:', error);
    return null;
  }
};

const defaultKeyFilePath = path.join(os.homedir(), '.ssh', 'id_rsa');
const sshRsaKey = readSshRsaKey(defaultKeyFilePath);

/**
 * Splits a `user@host:/remote/path` string into its parts.
 *
 * @param {string} sftpUrl - Location in `username@host:path` form.
 * @returns {{username: string, host: string, path: string}} The parsed parts.
 * @throws {Error} When the string does not match the expected form.
 */
const parseSftpUrl = (sftpUrl) => {
  const regex = /^(?<username>[^@]+)@(?<host>[^:]+):(?<path>.+)$/;
  const match = sftpUrl.match(regex);
  if (!match) {
    throw new Error('Invalid SFTP URL format');
  }
  const { username, host, path: remoteFilePath } = match.groups;
  return { username, host, path: remoteFilePath };
};

/**
 * Downloads one file over SFTP, authenticating with `sshRsaKey`.
 *
 * The transfer is awaited before the connection is closed; closing it while
 * the transfer is still running would cut the download short.
 *
 * @param {string} remotePath - Source in `username@host:path` form.
 * @param {string} localPath - Destination file on the local disk.
 * @param {object} [options={}] - Currently unused; kept for interface compatibility.
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws {Error} Re-throws parse, connection, and transfer errors after logging them.
 */
const downloadFileWithSftp = async (remotePath, localPath, options = {}) => {
  const sftp = new SftpClient();
  try {
    const sftpUrlParts = parseSftpUrl(remotePath);
    await sftp.connect({
      host: sftpUrlParts.host,
      username: sftpUrlParts.username,
      privateKey: sshRsaKey,
    });
    await sftp.get(sftpUrlParts.path, localPath);
  } catch (error) {
    console.error('Error downloading file:', error);
    throw error;
  } finally {
    try {
      await sftp.end();
    } catch (endError) {
      // Closing can fail when the connection was never established; the
      // original error (if any) is the one worth reporting, so only log this.
      console.error('Error closing SFTP connection:', endError.message);
    }
  }
};

/**
 * Downloads `url` over HTTP(S) into `filePath`, overwriting any existing file.
 *
 * @param {string} url - Document URL.
 * @param {string} filePath - Destination file on the local disk.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 * @throws {Error} When the request fails or either stream reports an error.
 */
async function downloadFile(url, filePath) {
  console.log(`Downloading: ${url}`);
  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream',
  });
  const writer = fs.createWriteStream(filePath, { start: 0 });
  await new Promise((resolve, reject) => {
    // pipe() does not forward source errors to the destination, so both
    // streams need their own handler.
    response.data.on('error', reject);
    writer.on('error', reject);
    writer.on('finish', resolve);
    response.data.pipe(writer);
  });
  console.log(`Download complete: ${url} => ${filePath}`);
}

/**
 * Looks up the checksum stored in Solr for the document whose id is `url`.
 *
 * @param {string} url - Document id (the document URL).
 * @param {string} [targetSolrUrl=solrUrl] - Base URL of the Solr core.
 * @returns {Promise<string|undefined>} The stored `sha256sum`, or undefined
 *   when the document is not indexed or has no checksum.
 */
async function getSolrIndexedFileChecksum(url, targetSolrUrl = solrUrl) {
  // Escape the characters that would terminate or alter a quoted Solr phrase.
  const escapedId = url.replace(/(["\\])/g, '\\$1');
  const response = await axios.get(`${targetSolrUrl}/select`, {
    // Passing params lets axios URL-encode the query.
    params: { q: `id:"${escapedId}"`, fl: 'sha256sum' },
  });
  const stored = response.data.response.docs[0]?.sha256sum;
  // A multi-valued field comes back as an array; use its first entry.
  return Array.isArray(stored) ? stored[0] : stored;
}

/**
 * Downloads one document, extracts its text with Tika, and indexes it in Solr.
 *
 * The document is skipped when Solr already holds a record with the same
 * SHA-256 checksum. Errors are logged rather than thrown so that one failing
 * document does not abort the whole indexing run.
 *
 * @param {string} url - URL of the document; also used as the Solr document id.
 * @param {string} solrUrl - Base URL of the Solr core to query and update.
 * @returns {Promise<void>}
 */
async function extractAndIndexWithTika(url, solrUrl) {
  try {
    await downloadFile(url, tempFilePath);
    const fileContent = fs.readFileSync(tempFilePath);
    const checksum = crypto.createHash('sha256').update(fileContent).digest('hex');

    // Query Solr to check if the file is already indexed.
    const solrChecksum = await getSolrIndexedFileChecksum(url, solrUrl);
    if (solrChecksum && solrChecksum === checksum) {
      console.log(`File ${url} hasn't changed. Skipping.`);
      return;
    }

    console.log(`Processing ${url}`);
    const client = new TikaClient({ host: tikaUrl });
    const version = await client.getVersion();
    console.info(`version: ${version}`);

    // Have Tika write its plain-text rendering to a scratch file, then read it back.
    await client.pipe(
      fs.createReadStream(tempFilePath),
      fs.createWriteStream(tikaOutputPath),
      'text/plain',
      tempFilePath,
    );
    const fileData = fs.readFileSync(tikaOutputPath, 'utf8');
    const textContent = sanitizeIndexData(extractTextFromHtml(fileData));

    const solrDocument = {
      id: url,
      text: textContent,
      sha256sum: checksum,
      url: url,
      content_type: extToMime(url),
    };

    console.log(`Indexing ${url}`);
    await indexDocumentInSolr(solrDocument, solrUrl);
    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error(`Error indexing ${url}:`, error.message);
  }
}

/**
 * Returns the trimmed text content of the `<body>` of an HTML string.
 *
 * @param {string} html - Markup (or plain text) to load with Cheerio.
 * @returns {string} Text of the body element with surrounding whitespace removed.
 */
function extractTextFromHtml(html) {
  const $ = cheerio.load(html);
  return $('body').text().trim();
}

/**
 * Posts one document to Solr's JSON update endpoint and commits immediately.
 *
 * @param {object} document - Solr document to index.
 * @param {string} [targetSolrUrl=solrUrl] - Base URL of the Solr core.
 * @returns {Promise<void>}
 * @throws {Error} When the request fails; the original error is attached as `cause`.
 */
async function indexDocumentInSolr(document, targetSolrUrl = solrUrl) {
  try {
    await axios.post(`${targetSolrUrl}/update/json/docs`, document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message, { cause: error });
  }
}

/**
 * Collapses every run of whitespace into a single space and trims the ends.
 *
 * @param {string} data - Text to normalise.
 * @returns {string} The normalised text.
 */
function sanitizeIndexData(data) {
  // One pass is enough: /\s+/g already replaces each whole run with one space.
  return data.replace(/\s+/g, ' ').trim();
}

/**
 * Deletes every document from the Solr core and commits.
 *
 * Failures are logged, not thrown.
 *
 * @returns {Promise<void>}
 */
async function clearSolrIndex() {
  try {
    const response = await axios.post(
      `${solrUrl}/update`,
      {
        delete: { query: '*:*' },
        commit: {},
      },
      {
        headers: {
          'Content-Type': 'application/json',
        },
      },
    );
    // responseHeader.status is Solr's status code for the request, not a document count.
    console.log(`Solr index cleared (status ${response.data.responseHeader.status})`);
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}

// Clears SOLR search index
const index_clear = async function () {
  await clearSolrIndex();
};
index_clear.displayName = 'index:clear';
gulp.task(index_clear);

// Reindexes SOLR search index with plaintext results from Tika.
// Documents are processed one at a time because they share the scratch files.
const index_index = async function () {
  for (const line of filesToIndex) {
    const documentUrl = baseUrl + line;
    await extractAndIndexWithTika(documentUrl, solrUrl);
  }
};
index_index.displayName = 'index:index';
gulp.task(index_index);

// Define index tasks
gulp.task('index:reindex', gulp.series('index:clear', 'index:index'));
gulp.task('index', gulp.series('index:index'));

// Define a default task (optional)
gulp.task('default', gulp.series('index'));