const gulp = require('gulp');
const request = require('request-promise-native');
const axios = require('axios');
const glob = require('glob');
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { TikaClient } = require('./app/TikaClient/build');
const { Writable } = require('stream');

const relPathToFiles = './public';                                             // Local directory containing the documents to index
const baseUrl = 'https://no-moss-3-carbo-landfill-library.online';             // Public base URL where the documents are published
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998';               // URL of the Tika server
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core';  // URL of your Solr core

// Task to clear out previous Solr data
gulp.task('index:clear', async () => {
  await request({
    uri: `${solrUrl}/update?commit=true`,
    method: 'POST',
    body: { delete: { query: '*:*' } }, // Delete all documents
    json: true,
  });
});

// Stream a file through SHA-256 and resolve with the hex digest
async function calculateSHA256Hash(filePath) {
  return new Promise((resolve, reject) => {
    const readStream = fs.createReadStream(filePath);
    const hash = crypto.createHash('sha256');
    readStream.on('data', (chunk) => hash.update(chunk));
    readStream.on('end', () => resolve(hash.digest('hex')));
    readStream.on('error', (error) => reject(error));
  });
}

// Retrieve metadata of a previously indexed file from Solr, keyed by its URL (the document id)
async function retrieveMetadataFromSolr(url) {
  const fl = encodeURIComponent('sha256sum, content_length');
  const q = encodeURIComponent('id:') + '"' + encodeURIComponent(url) + '"';
  const response = await request({
    uri: `${solrUrl}/select?q=${q}&fl=${fl}`,
    json: true,
  });
  return response && response.response && response.response.docs && response.response.docs[0];
}

// Send a single document to Solr's JSON document endpoint and commit immediately
async function indexDocumentInSolr(document) {
  try {
    await axios.post(solrUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}

// Map a file extension to a MIME type for the content_type field
function extToMime(file_name) {
  switch (path.extname(file_name)) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
    default:
      return 'text/plain';
  }
}
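// Note: the document shape built in `index:docs` below assumes the Solr core's schema
// defines (or schemaless mode auto-creates) the fields `id`, `text`, `sha256sum`, `url`,
// `content_length`, and `content_type`. The metadata comparison reads `content_length[0]`
// and `sha256sum[0]`, i.e. it expects those two fields to come back as multivalued arrays.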
// Task to index files into Solr
gulp.task('index:docs', async () => {
  // Glob patterns (relative to relPathToFiles) selecting the documents to index.
  // Note: no spaces inside the braces, otherwise the extensions would not match.
  let globs = [
    'Potesta_&_Associates/**/*.{pdf,docx,jpg,png,txt}',
    // 'Russell_County_BOS/Documents/**/*.{pdf,docx,jpg,png,txt}',
    'Russell_County_BOS/Meetings/**/*.{pdf,docx,jpg,png,txt}',
    'Russell_County_BOS/Ordinances/**/*.{pdf,docx,jpg,png,txt}',
    'Russell_County_IDA/Meetings/**/*.{pdf,docx,jpg,png,txt}',
    'Russell_County_Tourism/Agenda/**/*.{pdf,docx,jpg,png,txt}',
    'Russell_County_Tourism/Minutes/**/*.{pdf,docx,jpg,png,txt}',
    'United_Mine_Workers_of_America/**/*.{pdf,docx,jpg,png,txt}',
    'Virginia_Energy/**/*.{pdf,docx,jpg,png,txt}',
  ];

  // Use glob to match files in the local directories
  let files = [];
  let cwd = path.resolve(__dirname, relPathToFiles.replaceAll('/', path.sep));
  globs.forEach((globPattern) => {
    files = files.concat(glob.globSync(globPattern, {
      cwd,
      matchBase: true,
      follow: true,
    }));
  });
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);

  // Loop through each file and process it
  for (let f = 0; f < files.length; f++) {
    const file = files[f];
    console.log(`${f + 1}/${files.length}: ${file}`);
    const fileFullPath = path.join(cwd, file);
    const url = `${baseUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + url);

    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveMetadataFromSolr(url);

    // Calculate file size
    const stats = fs.statSync(fileFullPath);
    const fileSize = stats.size;

    // Calculate SHA-256 checksum
    const checksum = await calculateSHA256Hash(fileFullPath);

    // Compare against the metadata already stored in Solr
    if (!metadata || parseInt(metadata.content_length[0], 10) !== fileSize || metadata.sha256sum[0] !== checksum) {
      // Metadata mismatch or file not found in Solr, proceed with indexing
      console.log(`Processing text from file using Tika.`);
      const client = new TikaClient({ host: tikaUrl });
      const version = await client.getVersion();
      console.info(`Tika Server Version: ${version}`);

      // Create a readable stream for the file contents
      const fileStream = fs.createReadStream(fileFullPath);

      // Create a writable stream that captures the extracted text content into a string
      let extractedText = '';
      const writableStream = new Writable({
        write(chunk, encoding, callback) {
          extractedText += chunk.toString(); // Append the chunk to the extracted text
          callback();
        },
      });

      // Use the TikaClient's pipe method to extract text content
      await client.pipe(fileStream, writableStream, 'text/plain', path.basename(file));
      console.log('Extracted Text:', extractedText);

      // Create the Solr document
      const solrDocument = {
        id: url,                    // The public URL doubles as the unique document identifier
        text: extractedText,        // Extracted text content
        sha256sum: checksum,        // SHA-256 checksum of the file
        url: url,
        content_length: fileSize,
        content_type: extToMime(url),
        // Add additional fields as needed (e.g., title, author, etc.)
      };

      // Index the file with its text content and metadata
      console.log(`Indexing ${url}`);
      await indexDocumentInSolr(solrDocument);
      console.log(`Done.`);
    } else {
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
    }
  }
});

// Task to clear the index and then rebuild it from scratch
gulp.task('index:reindex', gulp.series('index:clear', 'index:docs'));

// Task to run indexing only
gulp.task('index', gulp.series('index:docs'));

// Default task runs indexing
gulp.task('default', gulp.series('index'));
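// Usage sketch (assumes gulp-cli is available, e.g. via `npm i -D gulp-cli` or a global install):
//   npx gulp                  # default task: incremental indexing (same as `npx gulp index`)
//   npx gulp index:clear      # delete every document from the Solr core
//   npx gulp index:reindex    # clear the core, then re-index all matched files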