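// gulpfile.js (forked from nm3clol/nm3clol-express-app)
//
// Gulp tasks that index the document library under ./public into Apache Solr,
// extracting text with an Apache Tika server and skipping files whose size and
// SHA-256 checksum already match what is stored in the index.
//
// Typical usage (assuming gulp-cli is available): `gulp index` to index new or
// changed files, `gulp index:reindex` to clear Solr and rebuild from scratch.
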
const gulp = require('gulp');
const request = require('request-promise-native');
const axios = require('axios');
const glob = require('glob');
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { TikaClient } = require('./app/TikaClient/build');
const { Writable } = require('stream');

const relPathToFiles = './public';
const baseUrl = 'https://no-moss-3-carbo-landfill-library.online'; // Base URL where the indexed files are published
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // URL of the Tika server
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of the main document Solr core
const solrVirginiaLawUrl = 'http://solr.services.cleveland.daball.me:8983/solr/va_code'; // URL of the Virginia law Solr core

// Task to clear out previous Solr data
gulp.task('index:clear', async () => {
  await request({
    uri: `${solrUrl}/update?commit=true`,
    method: 'POST',
    body: { delete: { query: '*:*' } }, // Delete all documents
    json: true,
  });
});
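
// The clear task uses Solr's JSON delete-by-query update; roughly equivalent to:
//   curl -X POST "$SOLR_URL/update?commit=true" -H 'Content-Type: application/json' \
//        -d '{"delete":{"query":"*:*"}}'
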
// Task to pull all indexed documents back out of Solr
gulp.task('dbfromsolr', async () => {
  let docs = await request({
    uri: `${solrUrl}/select`,
    qs: {
      q: '*:*',
      wt: 'json',
      rows: 10000,
    },
    json: true,
  });
  // Solr returns stored fields as arrays, so flatten each one to a scalar
  docs = docs?.response?.docs?.map(({ id, sha256sum, url, content_length, content_type, text, _version_ }) => ({
    id,
    url: url.join(''),
    content_length: parseInt(content_length.join(''), 10),
    sha256sum: sha256sum.join(''),
    content_type: content_type.join(''),
    text: text.join(''),
    _version_,
  }));
  // TODO: the original second .map() was left empty; persisting `docs` to the
  // database is not yet implemented.
});
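
// Note: rows=10000 caps this export; for larger result sets Solr's cursorMark
// pagination (or the /export handler) is the usual approach.
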
// Stream a file through SHA-256 and resolve with the hex digest
async function calculateSHA256Hash(filePath) {
  return new Promise((resolve, reject) => {
    const readStream = fs.createReadStream(filePath);
    const hash = crypto.createHash('sha256');

    readStream.on('data', (chunk) => {
      hash.update(chunk);
    });

    readStream.on('end', () => {
      resolve(hash.digest('hex'));
    });

    readStream.on('error', (error) => {
      reject(error);
    });
  });
}

// Function to retrieve metadata of a file from the Virginia law Solr core
async function retrieveVirginiaLawMetadataFromSolr(url) {
  // Look the document up by its URL, which doubles as its unique id
  const fl = encodeURIComponent('sha256sum, content_length');
  const q = encodeURIComponent('id:') + '"' + encodeURIComponent(url) + '"';
  const uri = `${solrVirginiaLawUrl}/select?q=${q}&fl=${fl}`;
  const response = await request({ uri, json: true });
  return response && response.response && response.response.docs && response.response.docs[0];
}

// Function to retrieve metadata of a file from the main document Solr core
async function retrieveMetadataFromSolr(url) {
  // Look the document up by its URL, which doubles as its unique id
  const fl = encodeURIComponent('sha256sum, content_length');
  const q = encodeURIComponent('id:') + '"' + encodeURIComponent(url) + '"';
  const uri = `${solrUrl}/select?q=${q}&fl=${fl}`;
  const response = await request({ uri, json: true });
  return response && response.response && response.response.docs && response.response.docs[0];
}
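
// Both lookups return the first matching doc, shaped roughly like
//   { sha256sum: ['ab12...'], content_length: ['12345'] }
// (assuming these fields are multivalued in the Solr schema), which is why the
// indexing tasks below read metadata.sha256sum[0] and metadata.content_length[0].
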
async function indexDocumentInSolr(document) {
  try {
    // Send the document to the main core's JSON document endpoint
    await axios.post(solrUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}

async function indexLawDocumentInSolr(document) {
  try {
    // Send the document to the Virginia law core's JSON document endpoint
    await axios.post(solrVirginiaLawUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}
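
// Note: commit=true on every document keeps the index immediately searchable,
// but per-document hard commits are slow for large batches; Solr's commitWithin
// update parameter is the usual alternative.
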
// Map a file extension to a MIME type, e.g. extToMime('Report.pdf') -> 'application/pdf'
function extToMime(file_name) {
  switch (path.extname(file_name)) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
      return 'text/plain';
    case '.mkv':
      return 'video/x-matroska';
    default:
      return 'text/plain';
  }
}

// Task to index law files into Solr
gulp.task('index:laws', async () => {
  //let scanExts = ''; //set to empty string to scan all
  let scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt}';
  let globs = [
    `Russell_County/Ordinances/**/*${scanExts}`,
    `Virginia_Law_Library/**/*${scanExts}`,
  ];
  // Use glob to match files in the local directories (globSync is synchronous,
  // so a plain forEach callback is fine here)
  let files = [];
  let cwd = path.resolve(__dirname, relPathToFiles.replaceAll('/', path.sep));
  globs.forEach((globPattern) => {
    files = files.concat(glob.globSync(globPattern, {
      cwd,
      matchBase: true,
      follow: true,
    }));
  });
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);
  // Loop through each file and process it
  for (let f = 0; f < files.length; f++) {
    const file = files[f];
    console.log(`${f + 1}/${files.length}: ${file}`);

    const fileFullPath = path.join(cwd, file);

    const url = `${baseUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + url);

    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveVirginiaLawMetadataFromSolr(url);

    // Calculate file size
    const stats = fs.statSync(fileFullPath);
    const fileSize = stats.size;

    // Calculate SHA256 checksum
    const checksum = await calculateSHA256Hash(fileFullPath);

    // Compare metadata; reindex when the file is new or has changed
    if (!metadata || parseInt(metadata.content_length[0], 10) != fileSize || metadata.sha256sum[0] != checksum) {
      console.log(`Processing text from file using Tika.`);
      const client = new TikaClient({ host: tikaUrl });
      const version = await client.getVersion();
      console.info(`Tika Server Version: ${version}`);

      // Create a Readable stream for the file contents
      const fileStream = fs.createReadStream(fileFullPath);

      // Create a Writable stream that captures the extracted text into a string
      let extractedText = '';
      const writableStream = new Writable({
        write(chunk, encoding, callback) {
          extractedText += chunk.toString(); // Append the chunk to the extracted text
          callback();
        }
      });

      // Use the TikaClient's pipe method to extract text content
      await client.pipe(fileStream, writableStream, 'text/plain', encodeURI(path.basename(file)));
      console.log("Extracted Text:", extractedText);

      // Create Solr document
      const solrDocument = {
        id: url, // The URL doubles as the unique identifier for the document
        text: extractedText, // Add the extracted text content
        sha256sum: checksum, // Add the checksum
        url: url,
        content_length: fileSize,
        content_type: extToMime(url),
        // Add additional fields as needed (e.g., title, author, etc.)
      };

      // Index the file with its text content and metadata
      console.log(`Indexing ${url}`);
      await indexLawDocumentInSolr(solrDocument);
      console.log(`Done.`);
    } else {
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
    }
  }
});

// Task to index general document files into Solr
gulp.task('index:docs', async () => {
  //let scanExts = ''; //set to empty string to scan all
  let scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt,mkv}';
  let globs = [
    `Amys_Drop_Box/**/*${scanExts}`,
    `CRS_Reports/**/*${scanExts}`,
    `Mine_Safety_and_Health_Administration/**/*${scanExts}`,
    `Potesta_&_Associates/**/*${scanExts}`,
    `Russell_County/**/*${scanExts}`,
    `Russell_County_Reclamation_LLC/**/*${scanExts}`,
    `Tobacco_Region_Revitalization_Commission/**/*${scanExts}`,
    `United_Mine_Workers_of_America/**/*${scanExts}`,
    `Virginia_Energy/**/*${scanExts}`,
    // I want to put Virginia Law in its own search category first.
    // `Virginia_Law_Library/**/*${scanExts}`,
  ];
  // Use glob to match files in the local directories (globSync is synchronous,
  // so a plain forEach callback is fine here)
  let files = [];
  let cwd = path.resolve(__dirname, relPathToFiles.replaceAll('/', path.sep));
  globs.forEach((globPattern) => {
    files = files.concat(glob.globSync(globPattern, {
      cwd,
      matchBase: true,
      follow: true,
    }));
  });
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);
  // Loop through each file and process it
  for (let f = 0; f < files.length; f++) {
    const file = files[f];
    console.log(`${f + 1}/${files.length}: ${file}`);

    const fileFullPath = path.join(cwd, file);

    let url = `${baseUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + url);

    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveMetadataFromSolr(url);

    // Calculate file size
    const stats = fs.statSync(fileFullPath);
    const fileSize = stats.size;

    // Calculate SHA256 checksum
    const checksum = await calculateSHA256Hash(fileFullPath);

    // Compare metadata; reindex when the file is new or has changed
    if (!metadata || parseInt(metadata.content_length[0], 10) != fileSize || metadata.sha256sum[0] != checksum) {
      console.log(`Processing text from file using Tika.`);
      const client = new TikaClient({ host: tikaUrl });
      const version = await client.getVersion();
      console.info(`Tika Server Version: ${version}`);

      let extractedText = '';

      let subtitleExt = ".en.vtt";
      if (url.endsWith(".webm") || url.endsWith(".mkv") || url.endsWith(".mpg") || url.endsWith(".mpeg") || url.endsWith(".mp4")) {
        // For videos, index the sidecar subtitle file rather than the video itself
        let subtitleFilePath = fileFullPath.substring(0, fileFullPath.lastIndexOf('.')) + subtitleExt;
        if (fs.existsSync(subtitleFilePath)) {
          console.log("Found VTT subtitle file at:", subtitleFilePath);
          extractedText = fs.readFileSync(subtitleFilePath, 'utf8');
          // Index videos under their containing directory URL (note: the metadata
          // lookup above used the file URL, so videos are re-checked on every run)
          url = url.substring(0, url.lastIndexOf('/') + 1);
        }
        else {
          console.log("No subtitles found at:", subtitleFilePath);
          console.log("Skipping this video file. Not adding this to the index until subtitles are available.");
          continue;
        }
      }
      else {
        // Create a Readable stream for the file contents
        const fileStream = fs.createReadStream(fileFullPath);
        // Create a Writable stream that captures the extracted text into a string
        const writableStream = new Writable({
          write(chunk, encoding, callback) {
            extractedText += chunk.toString(); // Append the chunk to the extracted text
            callback();
          }
        });
        // Use the TikaClient's pipe method to extract text content
        await client.pipe(fileStream, writableStream, 'text/plain', encodeURI(path.basename(file)));
      }
      if (!extractedText) {
        console.log("Skipping document because no text was detected.");
        continue;
      }
      else if (extractedText.length < 100) {
        console.log("Extracted Text:", extractedText);
      }
      else {
        console.log("Extracted Text (excerpt):", extractedText.substring(0, 99));
      }

      // Create Solr document
      const solrDocument = {
        id: url, // The URL doubles as the unique identifier for the document
        text: extractedText, // Add the extracted text content
        sha256sum: checksum, // Add the checksum
        url: url,
        content_length: fileSize,
        content_type: extToMime(url),
        // Add additional fields as needed (e.g., title, author, etc.)
      };

      // Index the file with its text content and metadata
      console.log(`Indexing ${url}`);
      await indexDocumentInSolr(solrDocument);
      console.log(`Done.`);
    } else {
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
    }
  }
});

// Task to run all indexing
gulp.task('index', gulp.series('index:docs', 'index:laws'));

// Task to clear out previous Solr data and then reindex from scratch
gulp.task('index:reindex', gulp.series('index:clear', 'index'));

// Default task runs indexing
gulp.task('default', gulp.series('index'));