const gulp = require('gulp');
const gulpif = require('gulp-if');
const through2 = require('through2');
//const shell = require('gulp-shell');
const axios = require('axios');
const fs = require('fs');
const os = require('os');
const path = require('path');
const crypto = require('crypto');
const cheerio = require('cheerio');
const { TikaClient } = require('tika-js');

const baseUrl = 'https://no-moss-3-carbo-landfill-library.online'; // Base URL of the documents to download and index
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // URL of the Tika instance
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance

// Map a file extension to the MIME type to report to Tika.
function extToMime(file_name) {
  switch (path.extname(file_name)) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
    default:
      return 'text/plain';
  }
}

// Returns a predicate (for gulp-if) that matches Vinyl files with the given extension.
function isFileExt(extname) {
  const fileExtnameMatchesExtname = function (file) {
    return file.extname === extname;
  };
  return fileExtnameMatchesExtname;
}

// Extract plain text from a file via Tika and write it to output.txt.
async function tikaReadPdf(file) {
  const client = new TikaClient({ host: tikaUrl });
  const file_reader = fs.createReadStream(file);
  await client.pipe(file_reader, fs.createWriteStream('output.txt'), 'text/plain', path.basename(file));
}

// Stream-based variant of the indexing task. It is not registered as a gulp task; it is
// kept here with the obvious fixes (gulp.src, deduplicated glob, a pass-through transform
// so gulp-if has a child stream) in case the stream approach is revisited.
function index() {
  return gulp.src([
    "public/Russell_County_IDA/**.pdf",
    //['.txt', '.html', '.htm', '.md', '.pdf']
  ])
    .pipe(gulpif(isFileExt(".pdf"), through2.obj(function (file, _enc, cb) {
      // Placeholder transform: pass PDF files through unchanged.
      cb(null, file);
    })));
}

// Download a document, extract its text with Tika, and index it in Solr.
async function extractAndIndexWithTika(url, solrUrl) {
  try {
    const tempFilePath = 'tempfile';
    await downloadFile(url, tempFilePath);
    const fileContent = fs.readFileSync(tempFilePath);
    const checksum = crypto.createHash('sha256').update(fileContent).digest('hex');

    // Query Solr to check whether the file is already indexed with the same checksum.
    const solrChecksumResponse = await axios.get(`${solrUrl}/select`, {
      params: { q: `id:"${url}"`, fl: 'sha256sum' },
    });
    const solrChecksum = solrChecksumResponse.data.response.docs[0]?.sha256sum;
    if (solrChecksum && solrChecksum === checksum) {
      console.log(`File ${url} hasn't changed. Skipping.`);
      return;
    }

    // Make request to Tika
    console.log(`Processing ${url}`);
    // const response = await axios.post(tikaUrl + '/form', formData, {
    //   headers: {
    //     // 'Content-Type': 'application/octet-stream',
    //     //'Content-Type': extToMime(url),
    //     //'Content-Length': fs.
    //     'Content-Type': 'multipart/form-data',
    //     'X-Tika-Output-Format': 'solr',
    //     //'X-Tika-SolrUrl': solrUrl
    //   },
    //   timeout: 40000000
    // });
    const client = new TikaClient({ host: tikaUrl });
    const version = await client.getVersion();
    console.info(`version: ${version}`);
    // Debug: log the MIME type and the content Tika extracts.
    console.info(extToMime(url), await client.getContent(fs.createReadStream(tempFilePath), extToMime(url), path.basename(url)));
    // Write the plain-text extraction to output.txt.
    await client.pipe(fs.createReadStream(tempFilePath), fs.createWriteStream('output.txt'), 'text/plain', tempFilePath);
    //console.log('Tika response:', fs.readFileSync('output.txt'));
    const fileData = fs.readFileSync('output.txt', 'utf8');
    //const contentLength = await fs.stat(tempFilePath).size;

    // Strip any markup from Tika's output and normalize whitespace.
    const textContent = sanitizeIndexData(extractTextFromHtml(fileData));

    // Create Solr document
    const solrDocument = {
      id: url, // Unique identifier for the document
      text: textContent, // Extracted text content
      sha256sum: checksum, // Checksum used to skip unchanged files
      //html: response.data,
      url: url,
      //content_length: contentLength,
      content_type: extToMime(url),
      // Add additional fields as needed (e.g., title, author, etc.)
    };

    // Send document to Solr for indexing
    console.log(`Indexing ${url}`);
    await indexDocumentInSolr(solrDocument);
    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  }
}
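// `downloadFile` is called by extractAndIndexWithTika but was not defined in this file.
// The sketch below is an assumption about its intent: stream the document at `url` into
// `destPath` using axios.
async function downloadFile(url, destPath) {
  const response = await axios.get(url, { responseType: 'stream' });
  await new Promise((resolve, reject) => {
    const writer = fs.createWriteStream(destPath);
    response.data.pipe(writer);
    writer.on('finish', resolve);
    writer.on('error', reject);
  });
}

// `filesToIndex` is used by the index:index task below but was not defined in this file.
// A minimal sketch, assuming the documents to index are the PDFs directly under
// public/Russell_County_IDA (the folder targeted by the glob above), expressed as URL
// paths relative to baseUrl.
const filesToIndex = fs.existsSync('public/Russell_County_IDA')
  ? fs.readdirSync('public/Russell_County_IDA')
      .filter((name) => path.extname(name) === '.pdf')
      .map((name) => '/Russell_County_IDA/' + encodeURIComponent(name))
  : [];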
// Pull the visible text out of an HTML (or plain-text) string.
function extractTextFromHtml(html) {
  // Parse HTML using Cheerio
  const $ = cheerio.load(html);
  // Extract text content from HTML
  const textContent = $('body').text().trim();
  return textContent;
}

// Send one document to Solr for indexing.
async function indexDocumentInSolr(document) {
  try {
    // Send the document to Solr's JSON update endpoint via Axios.
    await axios.post(solrUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}

// Collapse all whitespace runs to single spaces and trim the result.
function sanitizeIndexData(data) {
  // Convert all whitespace characters to spaces
  let sanitizedData = data.replace(/\s+/g, ' ');
  // Collapse any remaining double spaces
  while (sanitizedData !== (sanitizedData = sanitizedData.replace(/  /g, ' '))) {}
  return sanitizedData.trim(); // Trim leading and trailing spaces
}

// Delete every document in the Solr core.
async function clearSolrIndex() {
  try {
    // Send delete query to Solr to delete all documents
    const response = await axios.post(solrUrl + '/update', {
      delete: { query: '*:*' },
      commit: {}
    }, {
      headers: { 'Content-Type': 'application/json' }
    });
    console.log('Cleared Solr index (status ' + response.data.responseHeader.status + ')');
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}

// Clears SOLR search index
const index_clear = async function () {
  await clearSolrIndex();
};
index_clear.displayName = 'index:clear';
gulp.task(index_clear);

// Reindexes SOLR search index with plaintext results from Tika
const index_index = async function () {
  for (let l = 0; l < filesToIndex.length; l++) {
    let line = filesToIndex[l];
    let documentUrl = baseUrl + line;
    await extractAndIndexWithTika(documentUrl, solrUrl);
  }
};
index_index.displayName = 'index:index';
gulp.task(index_index);

// Define index tasks
gulp.task('index:reindex', gulp.series('index:clear', 'index:index'));
gulp.task('index', gulp.series('index:index'));

// Define a default task (optional)
gulp.task('default', gulp.series('index'));
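// Usage (assuming gulp-cli is available, e.g. via npx):
//   npx gulp index:clear     - delete every document from the Solr core
//   npx gulp index:index     - extract text with Tika and index each document
//   npx gulp index:reindex   - clear the core, then reindex everything
//   npx gulp                 - default task, equivalent to `gulp index`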