// nm3clol-express-app/index/gulpfile.js
//
// 195 lines
// 6.6 KiB
// JavaScript

const gulp = require('gulp');
const gulpif = require('gulp-if');
const through2 = require('through2');
//const shell = require('gulp-shell');
const axios = require('axios');
const fs = require('fs');
const os = require ("os");
const path = require('path');
const crypto = require('crypto');
const cheerio = require('cheerio');
const { TikaClient } = require('tika-js');
const baseUrl = 'https://no-moss-3-carbo-landfill-library.online'; // Base URL that each document path is appended to, forming the URL that gets downloaded and indexed
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // Host handed to TikaClient (the Tika server)
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // Base URL of the Solr core used for the select and update requests
/**
 * Maps a file name, path, or URL to a MIME type based on its extension.
 *
 * The extension is compared case-insensitively, so `REPORT.PDF` resolves to
 * the same type as `report.pdf` instead of falling through to plain text.
 *
 * @param {string} file_name - Name, path, or URL that ends with the file's extension.
 * @returns {string} `text/html` for `.htm`/`.html`, `application/pdf` for `.pdf`,
 *   and `text/plain` for everything else (including `.md`, `.txt`, and no extension).
 */
function extToMime(file_name) {
  switch (path.extname(file_name).toLowerCase()) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
    default:
      return 'text/plain';
  }
}
/**
 * Builds a predicate that checks whether a file object has the given extension.
 *
 * The earlier version returned an undeclared identifier
 * (`file_extname_matches_extname`), so every call threw a ReferenceError
 * before a predicate was ever produced.
 *
 * @param {string} extname - Extension to match, including the leading dot (e.g. ".pdf").
 * @returns {function({extname: string}): boolean} Predicate that is true when
 *   `file.extname` is exactly `extname`.
 */
function isFileExt(extname) {
  return (file) => file.extname === extname;
}
/**
 * Streams a local file to Tika and writes the plain-text result to `output.txt`
 * in the current working directory (overwriting any previous content).
 *
 * @param {string} file - Path of the file to read and send to Tika.
 * @returns {Promise<void>} Settles when `client.pipe` settles.
 */
async function tikaReadPdf(file) {
  const client = new TikaClient({ host: tikaUrl });
  const source = fs.createReadStream(file);
  const sink = fs.createWriteStream('output.txt');
  // The last argument used to be an undeclared `tempFilePath`, which threw a
  // ReferenceError on every call. It now passes the name of the file being read.
  // NOTE(review): the argument is presumably a file-name hint for Tika, as in the
  // other `client.pipe`/`client.getContent` calls in this file — confirm against tika-js.
  await client.pipe(source, sink, 'text/plain', path.basename(file));
}
/**
 * Builds a gulp stream over the PDFs in `public/Russell_County_IDA` and sends
 * each one through `tikaReadPdf`.
 *
 * Changes from the earlier draft, which could not run:
 *  - `src` was an undeclared identifier; it is now `gulp.src`.
 *  - the same glob was listed twice.
 *  - `gulpif` was called without a stream for the matching branch.
 *  - the stream is now returned so a caller (or gulp) can observe completion.
 *
 * NOTE(review): extracting each PDF with `tikaReadPdf` is the inferred purpose
 * of the empty `gulpif` branch — confirm before registering this as a task.
 *
 * @returns {NodeJS.ReadWriteStream} The piped gulp stream.
 */
function index() {
  return gulp
    .src([
      // NOTE(review): `**.pdf` likely matches only PDFs directly inside this
      // folder; confirm whether a recursive pattern such as `**/*.pdf` was meant.
      'public/Russell_County_IDA/**.pdf',
      // TODO: the draft hinted at also handling '.txt', '.html', '.htm', '.md'.
    ])
    .pipe(
      gulpif(
        isFileExt('.pdf'),
        through2.obj((file, _encoding, done) => {
          // Forward the file downstream once Tika has processed it; surface
          // failures through the stream instead of dropping them.
          tikaReadPdf(file.path).then(
            () => done(null, file),
            (error) => done(error),
          );
        }),
      ),
    );
}
/**
 * Downloads a document, extracts its text with Tika, and indexes it in Solr.
 *
 * The SHA-256 of the downloaded bytes is compared with the `sha256sum` stored
 * in Solr under the same `id`; when they match the document is skipped.
 * Any failure is logged and swallowed, so a caller looping over many documents
 * continues with the next one.
 *
 * Side effects: writes `tempfile` and `output.txt` in the current working
 * directory and leaves them in place.
 *
 * NOTE(review): `downloadFile` is not defined in the visible part of this
 * file — confirm it is declared elsewhere.
 *
 * @param {string} url - Document URL; also used as the Solr document `id`.
 * @param {string} solrUrl - Base URL of the Solr core queried for the stored checksum.
 * @returns {Promise<void>}
 */
async function extractAndIndexWithTika(url, solrUrl) {
  const tempFilePath = 'tempfile';
  const extractedTextPath = 'output.txt';
  try {
    await downloadFile(url, tempFilePath);
    const fileContent = fs.readFileSync(tempFilePath);
    const checksum = crypto.createHash('sha256').update(fileContent).digest('hex');

    // Query Solr to check if the file is already indexed. The query goes through
    // axios `params` so characters such as `&`, `#`, `+` or spaces in the URL are
    // encoded instead of corrupting the query string; quotes and backslashes are
    // escaped so the id stays a single quoted phrase.
    const quotedId = url.replace(/["\\]/g, '\\$&');
    const solrChecksumResponse = await axios.get(`${solrUrl}/select`, {
      params: { q: `id:"${quotedId}"`, fl: 'sha256sum' },
    });
    const storedChecksum = solrChecksumResponse.data.response.docs[0]?.sha256sum;
    // Solr returns an array for multiValued fields; unwrap it so the comparison
    // below can still match.
    const solrChecksum = Array.isArray(storedChecksum) ? storedChecksum[0] : storedChecksum;
    if (solrChecksum && solrChecksum === checksum) {
      // Previously interpolated an undeclared `filePath`, which threw and sent
      // the "unchanged" case into the catch block with a misleading message.
      console.log(`File ${url} hasn't changed. Skipping.`);
      return;
    }

    // Make request to Tika
    console.log(`Processing ${url}`);
    const client = new TikaClient({ host: tikaUrl });
    const version = await client.getVersion();
    console.info(`version: ${version}`);
    const mimeType = extToMime(url);
    console.info(
      mimeType,
      await client.getContent(fs.createReadStream(tempFilePath), mimeType, path.basename(url)),
    );
    await client.pipe(
      fs.createReadStream(tempFilePath),
      fs.createWriteStream(extractedTextPath),
      'text/plain',
      tempFilePath,
    );
    const fileData = fs.readFileSync(extractedTextPath);

    // Tika's output is run through the HTML body-text extractor and then
    // whitespace-normalized before indexing.
    const textContent = sanitizeIndexData(extractTextFromHtml(fileData));

    // Create Solr document
    const solrDocument = {
      id: url, // The document URL doubles as the unique key
      text: textContent,
      sha256sum: checksum,
      url,
      content_type: mimeType,
    };

    // Send document to Solr for indexing
    console.log(`Indexing ${url}`);
    await indexDocumentInSolr(solrDocument);
    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    // Best-effort: report and move on so one bad document does not stop a batch.
    console.error('Error extracting text with Tika:', error.message);
  }
}
/**
 * Returns the text of the document's `<body>`, with surrounding whitespace removed.
 *
 * @param {string|Buffer} html - Markup handed to `cheerio.load`.
 * @returns {string} Trimmed text content of the `body` selection.
 */
function extractTextFromHtml(html) {
  const body = cheerio.load(html)('body');
  return body.text().trim();
}
/**
 * Posts one document to Solr's JSON document update endpoint and commits immediately.
 *
 * @param {object} document - Document to index; sent as the request body.
 * @returns {Promise<void>}
 * @throws {Error} With the message prefix "Error indexing document in Solr: ".
 *   The underlying failure is attached as `cause` so its stack and response
 *   details are not lost.
 */
async function indexDocumentInSolr(document) {
  try {
    await axios.post(`${solrUrl}/update/json/docs`, document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error(`Error indexing document in Solr: ${error.message}`, { cause: error });
  }
}
/**
 * Normalizes whitespace for indexing: every run of whitespace becomes a single
 * space and leading/trailing whitespace is removed.
 *
 * The earlier follow-up loop replaced a space with a space, so it never changed
 * anything; `\s+` already collapses runs in one pass, so the loop is gone.
 *
 * @param {string} data - Text to normalize.
 * @returns {string} Text with single-space separators and no outer whitespace.
 */
function sanitizeIndexData(data) {
  return data.replace(/\s+/g, ' ').trim();
}
/**
 * Deletes every document from the Solr core (`*:*`) and commits.
 *
 * Failures are logged and not rethrown, so callers always see a resolved promise.
 *
 * @returns {Promise<void>}
 */
async function clearSolrIndex() {
  try {
    // Send delete query to Solr to delete all documents
    const response = await axios.post(
      `${solrUrl}/update`,
      {
        delete: {
          query: '*:*',
        },
        commit: {},
      },
      {
        headers: {
          'Content-Type': 'application/json',
        },
      },
    );
    // `responseHeader.status` is Solr's status code (0 on success), not a count
    // of deleted documents, so it is reported as a status.
    const { status } = response.data.responseHeader;
    console.log(`Solr index cleared (response status ${status})`);
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}
// Gulp task `index:clear`: removes every document from the Solr search index.
const index_clear = async () => {
  await clearSolrIndex();
};
index_clear.displayName = 'index:clear';
gulp.task(index_clear);
// Gulp task `index:index`: walks the file list one entry at a time, builds each
// document URL from `baseUrl`, and runs it through Tika extraction and Solr indexing.
// NOTE(review): `filesToIndex` is not defined in the visible part of this file —
// confirm it is declared elsewhere.
const index_index = async () => {
  for (const relativePath of filesToIndex) {
    const documentUrl = baseUrl + relativePath;
    await extractAndIndexWithTika(documentUrl, solrUrl);
  }
};
index_index.displayName = 'index:index';
gulp.task(index_index);
// Composite index tasks, built from the tasks registered under the names
// 'index:clear' and 'index:index'.
// `index:reindex` runs the clear task and then the index task, in that order.
gulp.task('index:reindex', gulp.series('index:clear', 'index:index'));
// `index` is a shorthand that runs only 'index:index' (no clearing first).
gulp.task('index', gulp.series('index:index'));
// `default` runs the 'index' task registered on the previous line.
gulp.task('default', gulp.series('index'));