nm3clol-express-app/gulpfile.mts

430 lines
18 KiB
TypeScript

import gulp from 'gulp';
import request from 'request-promise-native';
import axios from 'axios';
import { glob } from 'glob';
import fs, { WriteStream } from 'fs';
import path from 'path';
import crypto from 'crypto';
import url from 'url';
import { TikaClient } from './app/tika/index.mjs';
import { Readable, Writable } from 'stream';
import dotenv from 'dotenv';
import dotenvExpand from 'dotenv-expand';
import process from 'process';
import { config } from './app/config.mjs';
import { SolrDocument } from './app/search/solr-doc.mjs';
// import ts from 'gulp-typescript';
import run from 'gulp-run';
import { deleteAsync } from 'del';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Compile the project by shelling out to `tsc` with the build-specific tsconfig.
gulp.task('build:typescript:compile', () => {
  // Earlier gulp-typescript approach, kept for reference:
  //   const tsProject = ts.createProject('tsconfig.build.json');
  //   return tsProject.src().pipe(tsProject()).js.pipe(gulp.dest('../dist'));
  const compileCommand = run('tsc -project tsconfig.build.json', { verbosity: 3 });
  return compileCommand.exec();
});
// Empty ../dist, keeping only the compiled gulpfile and its declaration file.
gulp.task('build:clean', async () => {
  const cleanPatterns = [
    '../dist/**/*',
    '!../dist/gulpfile.mjs',
    '!../dist/gulpfile.d.mts',
  ];
  const removedPaths = await deleteAsync(cleanPatterns);
  return removedPaths;
});
// Copy both tsconfig files into ../dist.
gulp.task('build:copy:tsconfig', () => {
  const tsconfigFiles = ['../tsconfig.build.json', '../tsconfig.json'];
  const copied = gulp.src(tsconfigFiles).pipe(gulp.dest('../dist'));
  return copied;
});
// Copy everything under ../assets into ../dist/assets.
//
// The stream is returned directly (not awaited inside an async function):
// a stream is not a promise, so awaiting it resolves immediately and gulp
// would consider the task finished before the files are actually written.
// Returning the stream lets gulp wait for its end and surface its errors.
gulp.task('build:copy:assets', () => {
  return gulp.src('../assets/**/*').pipe(gulp.dest('../dist/assets'));
});
// Copy everything under ../pages into ../dist/pages.
//
// The stream is returned directly (not awaited inside an async function):
// a stream is not a promise, so awaiting it resolves immediately and gulp
// would consider the task finished before the files are actually written.
// Returning the stream lets gulp wait for its end and surface its errors.
gulp.task('build:copy:pages', () => {
  return gulp.src('../pages/**/*').pipe(gulp.dest('../dist/pages'));
});
// Copy everything under ../app/views into ../dist/views.
//
// The stream is returned directly (not awaited inside an async function):
// a stream is not a promise, so awaiting it resolves immediately and gulp
// would consider the task finished before the files are actually written.
// Returning the stream lets gulp wait for its end and surface its errors.
gulp.task('build:copy:views', () => {
  return gulp.src('../app/views/**/*').pipe(gulp.dest('../dist/views'));
});
// Full build pipeline; gulp.series runs the steps one after another in this order.
gulp.task(
  'build',
  gulp.series(
    'build:clean',
    'build:copy:tsconfig',
    'build:copy:pages',
    'build:copy:assets',
    'build:copy:views',
    'build:typescript:compile',
  ),
);
// Task that wipes the documents Solr endpoint (config.solrDocsUrl) by posting
// a delete-by-query matching every document, committing immediately.
// NOTE(review): only config.solrDocsUrl is cleared here; config.solrLawUrl is
// not touched — confirm that is intended for 'index:reindex'.
gulp.task('index:clear', async () => {
  const updateUri = `${config.solrDocsUrl}/update?commit=true`;
  const deleteAllCommand = { delete: { query: '*:*' } }; // Delete all documents
  await request({
    uri: updateUri,
    method: 'POST',
    body: deleteAllCommand,
    json: true,
  });
});
// gulp.task('dbfromsolr', async () => {
// let docs = await request({
// uri: `${config.solrDocsUrl}/select`,
// qs: {
// q: '*:*',
// wt: 'json',
// rows: 10000,
// },
// json: true
// });
// docs = docs?.response?.docs?.map(({id, sha256sum, url, content_length, content_type, text, _version_}: SolrDocument) =>
// {
// return {
// id,
// url: url.join(''),
// content_length: parseInt(content_length.join()),
// sha256sum: sha256sum.join(''),
// content_type: content_type.join(''),
// text: text.join(''),
// _version_,
// }
// }).map(doc => {
// })
// });
/**
 * Streams a file through SHA-256 and returns the digest.
 *
 * @param filePath Path of the file to hash.
 * @returns The lowercase hexadecimal SHA-256 digest of the file contents.
 *          The promise rejects with the stream's error if the file cannot be read.
 */
async function calculateSHA256Hash(filePath: string): Promise<string> {
  const hasher = crypto.createHash('sha256');
  // Async iteration feeds every chunk into the hash and throws on a read error.
  for await (const piece of fs.createReadStream(filePath)) {
    hasher.update(piece);
  }
  return hasher.digest('hex');
}
/**
 * Retrieves the stored metadata of a file from the Solr endpoint at
 * config.solrLawUrl.
 *
 * @param url URL of the file; it is matched against the Solr `id` field.
 * @returns The first matching Solr document, restricted to the `sha256sum`
 *          and `content_length` fields, or `undefined` when nothing matches.
 */
async function retrieveVirginiaLawMetadataFromSolr(url: string) {
  // The id is sent as a quoted phrase (id:"..."). Inside the quotes a double
  // quote or backslash in the value must be backslash-escaped, otherwise it
  // would terminate or corrupt the phrase.
  const escapedId = url.replace(/["\\]/g, '\\$&');
  const fl = encodeURIComponent('sha256sum, content_length');
  const q = encodeURIComponent('id:') + '"' + encodeURIComponent(escapedId) + '"';
  const uri = `${config.solrLawUrl}/select?q=${q}&fl=${fl}`;
  const response = await request({ uri, json: true });
  return response?.response?.docs?.[0];
}
/**
 * Retrieves the stored metadata of a file from the Solr endpoint at
 * config.solrDocsUrl.
 *
 * @param url URL of the file; it is matched against the Solr `id` field.
 * @returns The first matching Solr document, restricted to the `sha256sum`
 *          and `content_length` fields, or `undefined` when nothing matches.
 */
async function retrieveMetadataFromSolr(url: string) {
  // The id is sent as a quoted phrase (id:"..."). Inside the quotes a double
  // quote or backslash in the value must be backslash-escaped, otherwise it
  // would terminate or corrupt the phrase.
  const escapedId = url.replace(/["\\]/g, '\\$&');
  const fl = encodeURIComponent('sha256sum, content_length');
  const q = encodeURIComponent('id:') + '"' + encodeURIComponent(escapedId) + '"';
  const uri = `${config.solrDocsUrl}/select?q=${q}&fl=${fl}`;
  const response = await request({ uri, json: true });
  return response?.response?.docs?.[0];
}
/**
 * Posts a document to the Solr endpoint at config.solrDocsUrl and commits
 * immediately.
 *
 * @param document The Solr document to index.
 * @throws Error prefixed with 'Error indexing document in Solr: ' whenever the
 *         request fails, including failures that carry no message, so a failed
 *         request is never reported to the caller as a success.
 */
async function indexDocumentInSolr(document: SolrDocument) {
  try {
    await axios.post(config.solrDocsUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    // Always rethrow: previously an error without a message was silently dropped.
    const reason = error instanceof Error && error.message ? error.message : String(error);
    throw new Error('Error indexing document in Solr: ' + reason);
  }
}
/**
 * Posts a document to the Solr endpoint at config.solrLawUrl and commits
 * immediately.
 *
 * @param document The Solr document to index.
 * @throws Error prefixed with 'Error indexing document in Solr: ' whenever the
 *         request fails, including failures that carry no message, so a failed
 *         request is never reported to the caller as a success.
 */
async function indexLawDocumentInSolr(document: SolrDocument) {
  try {
    await axios.post(config.solrLawUrl + '/update/json/docs', document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    // Always rethrow: previously an error without a message was silently dropped.
    const reason = error instanceof Error && error.message ? error.message : String(error);
    throw new Error('Error indexing document in Solr: ' + reason);
  }
}
/**
 * Maps a file name, path or URL to a MIME type based on its extension.
 *
 * The extension is compared case-insensitively. Unknown or missing extensions
 * fall back to 'text/plain'.
 *
 * @param fileName File name, path or URL whose extension is inspected.
 * @returns The MIME type string for the extension.
 */
function extToMime(fileName: string): string {
  switch (path.extname(fileName).toLowerCase()) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
      // Previously fell through to the '.mkv' case and was reported as video.
      return 'text/markdown';
    case '.mkv':
      return 'video/x-matroska';
    case '.txt':
    default:
      return 'text/plain';
  }
}
// Task to index law files into Solr.
//
// For every file matched under the public directory the task:
//   1. builds the file's public URL from config.siteUrl,
//   2. fetches the stored size/checksum via retrieveVirginiaLawMetadataFromSolr,
//   3. skips the file when both match the file on disk,
//   4. otherwise extracts its text through Tika and indexes it with
//      indexLawDocumentInSolr.
gulp.task('index:laws', async () => {
  //let scanExts = ''; //set to empty string to scan all
  const scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt}';
  const globs = [
    `Russell_County/Ordinances/**/*${scanExts}`,
    `Virginia_Law_Library/**/*${scanExts}`,
  ];
  // Resolve the public directory exactly like the 'index:docs' task does
  // (relative to the parent of this file's directory) so both tasks scan the
  // same tree. When config.publicPath is absolute the extra '..' has no effect.
  const cwd = path.resolve(__dirname, '..', config.publicPath.replaceAll('/', path.sep));
  // Use glob to match files in the local directories
  const files: string[] = globs.flatMap((globPattern) =>
    glob.globSync(globPattern, {
      cwd,
      matchBase: true,
      follow: true,
    }),
  );
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);
  // Accept a stored field either as an array (first element is used) or as a
  // bare value, and tolerate a missing field instead of throwing.
  const firstValue = (field: unknown): unknown => (Array.isArray(field) ? field[0] : field);
  // Loop through each file and process them
  for (let index = 0; index < files.length; index++) {
    const file = files[index];
    console.log(`${index + 1}/${files.length}: ${file}`);
    const fileFullPath = path.join(cwd, file);
    const url = `${config.siteUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + url);
    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveVirginiaLawMetadataFromSolr(url);
    // Calculate file size
    const fileSize = fs.statSync(fileFullPath).size;
    // Calculate SHA256 checksum
    const checksum = await calculateSHA256Hash(fileFullPath);
    // Compare metadata
    const storedSize = Number.parseInt(String(firstValue(metadata?.content_length)), 10);
    const storedChecksum = firstValue(metadata?.sha256sum);
    if (metadata && storedSize === fileSize && storedChecksum === checksum) {
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
      continue;
    }
    // Metadata mismatch or file not found in Solr, proceed with indexing
    console.log(`Processing text from file using Tika.`);
    const client = new TikaClient({ host: config.tikaUrl });
    const version = await client.getVersion();
    console.info(`Tika Server Version: ${version}`);
    // Create a Readable stream for the file contents
    const fileStream = fs.createReadStream(fileFullPath);
    // Create a writable stream to capture the extracted text content into a string
    let extractedText = '';
    const writableStream = new Writable({
      write(chunk, encoding, callback) {
        extractedText += chunk.toString(); // Append the chunk to the extracted text
        callback();
      },
    });
    // Use the TikaClient's pipe method to extract text content
    await client.pipe(fileStream, writableStream, 'text/plain', encodeURI(path.basename(file)));
    // Log only a short excerpt of long texts (same policy as 'index:docs')
    // instead of dumping the whole document to the console.
    if (extractedText.length < 100) {
      console.log("Extracted Text:", extractedText);
    } else {
      console.log("Extracted Text (excerpt):", extractedText.substring(0, 99));
    }
    // Create Solr document
    const solrDocument: SolrDocument = {
      id: url, // The public URL doubles as the document's unique identifier
      text: extractedText, // Add the extracted text content
      sha256sum: checksum, // Add the checksum
      //html: response.data,
      url: url,
      content_length: fileSize,
      content_type: extToMime(url),
      // Add additional fields as needed (e.g., title, author, etc.)
    };
    // Send document to Solr for indexing
    // Index the file with its text content and metadata
    console.log(`Indexing ${url}`);
    await indexLawDocumentInSolr(solrDocument);
    // Continue
    console.log(`Done.`);
  }
});
// Task to index document files into Solr.
//
// For every file matched under the public directory the task:
//   1. builds the file's public URL from config.siteUrl,
//   2. fetches the stored size/checksum via retrieveMetadataFromSolr,
//   3. skips the file when both match the file on disk,
//   4. otherwise obtains its text (a sibling .en.vtt subtitle file for videos,
//      Tika extraction for everything else) and indexes it with
//      indexDocumentInSolr. Files that yield no text are not indexed.
gulp.task('index:docs', async () => {
  //let scanExts = ''; //set to empty string to scan all
  const scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt,mkv}';
  const globs = [
    `2024-02-16 FOIA Response/*${scanExts}`, `2024-02-16 FOIA Response/**/*${scanExts}`,
    `Amys_Drop_Box/*${scanExts}`, `Amys_Drop_Box/**/*${scanExts}`,
    `CRS_Reports/*${scanExts}`, `CRS_Reports/**/*${scanExts}`,
    `Cumberland_Plateau_Planning_District_Commission/*${scanExts}`, `Cumberland_Plateau_Planning_District_Commission/**/*${scanExts}`,
    `David_Eaton_District_4_-_Supervisor_Screenshot_Collection/*${scanExts}`, `David_Eaton_District_4_-_Supervisor_Screenshot_Collection/**/*${scanExts}`,
    `Environmental_Protection_Agency/*${scanExts}`, `Environmental_Protection_Agency/**/*${scanExts}`,
    `Gentry_Locke/*${scanExts}`, `Gentry_Locke/**/*${scanExts}`,
    `Mine_Safety_and_Health_Administration/*${scanExts}`, `Mine_Safety_and_Health_Administration/**/*${scanExts}`,
    `News/*${scanExts}`, `News/**/*${scanExts}`,
    `Potesta_&_Associates/*${scanExts}`, `Potesta_&_Associates/**/*${scanExts}`,
    `Russell_County/*${scanExts}`, `Russell_County/**/*${scanExts}`,
    `Scott_County/*${scanExts}`, `Scott_County/**/*${scanExts}`,
    `Tobacco_Region_Revitalization_Commission/*${scanExts}`, `Tobacco_Region_Revitalization_Commission/**/*${scanExts}`,
    `United_Mine_Workers_of_America/*${scanExts}`, `United_Mine_Workers_of_America/**/*${scanExts}`,
    `Virginia_Energy/*${scanExts}`, `Virginia_Energy/**/*${scanExts}`,
    `Virginia_Governor/*${scanExts}`, `Virginia_Governor/**/*${scanExts}`,
    `Virginia_Tech/*${scanExts}`, `Virginia_Tech/**/*${scanExts}`,
    // I want to put Virginia Law in its own search category first.
    // `Virginia_Law_Library/**/*${scanExts}`,
  ];
  // Use glob to match files in the local directories
  const cwd = path.resolve(__dirname, '..', config.publicPath.replaceAll('/', path.sep));
  const matches = globs.flatMap((globPattern) =>
    glob.globSync(globPattern, {
      cwd,
      matchBase: true,
      follow: true,
    }),
  );
  // Every directory is listed with both a `*` and a `**/*` pattern, and `**`
  // also matches zero directories, so top-level files are matched twice.
  // De-duplicate (keeping first-seen order) so no file is processed twice.
  const files: string[] = Array.from(new Set(matches));
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);
  // Accept a stored field either as an array (first element is used) or as a
  // bare value, and tolerate a missing field instead of throwing.
  const firstValue = (field: unknown): unknown => (Array.isArray(field) ? field[0] : field);
  const videoExts = ['.webm', '.mkv', '.mpg', '.mpeg', '.mp4'];
  const subtitleExt = ".en.vtt";
  // Loop through each file and process them
  for (let index = 0; index < files.length; index++) {
    const file = files[index];
    console.log(`${index + 1}/${files.length}: ${file}`);
    const fileFullPath = path.join(cwd, file);
    const fileUrl = `${config.siteUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + fileUrl);
    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveMetadataFromSolr(fileUrl);
    // Calculate file size
    const fileSize = fs.statSync(fileFullPath).size;
    // Calculate SHA256 checksum
    const checksum = await calculateSHA256Hash(fileFullPath);
    // Compare metadata
    const storedSize = Number.parseInt(String(firstValue(metadata?.content_length)), 10);
    const storedChecksum = firstValue(metadata?.sha256sum);
    if (metadata && storedSize === fileSize && storedChecksum === checksum) {
      console.log('prior metadata:', metadata);
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
      continue;
    }
    // Metadata mismatch or file not found in Solr, proceed with indexing
    console.log(`Processing text from file using Tika.`);
    const client = new TikaClient({ host: config.tikaUrl });
    const version = await client.getVersion();
    console.info(`Tika Server Version: ${version}`);
    let extractedText = '';
    // URL the document is stored under; differs from fileUrl for videos (below).
    let documentUrl = fileUrl;
    if (videoExts.some((ext) => fileUrl.endsWith(ext))) {
      const subtitleFilePath = fileFullPath.substring(0, fileFullPath.lastIndexOf('.')) + subtitleExt;
      if (fs.existsSync(subtitleFilePath)) {
        console.log("Found VTT subtitle file at:", subtitleFilePath);
        extractedText = fs.readFileSync(subtitleFilePath, 'utf8');
        // Videos are indexed under the URL of their containing directory.
        // NOTE(review): the metadata lookup above uses the file URL while the
        // document id is this directory URL, so the skip check presumably never
        // matches for videos — confirm this is intended.
        documentUrl = fileUrl.substring(0, fileUrl.lastIndexOf('/') + 1);
      } else {
        console.log("No subtitles found at: ", subtitleFilePath);
        console.log("Skipping this video file. Not adding this to the index until subtitles are available.")
        continue;
      }
    } else {
      // Create a Readable stream for the file contents
      const fileStream = fs.createReadStream(fileFullPath);
      // Create a writable stream to capture the extracted text content into a string
      const writableStream = new Writable({
        write(chunk, encoding, callback) {
          extractedText += chunk.toString(); // Append the chunk to the extracted text
          callback();
        },
      });
      // Use the TikaClient's pipe method to extract text content
      await client.pipe(fileStream, writableStream, 'text/plain', encodeURI(path.basename(file)));
    }
    if (!extractedText) {
      console.log("Skipping document because no text was detected.");
      continue;
    }
    if (extractedText.length < 100) {
      console.log("Extracted Text:", extractedText);
    } else {
      console.log("Extracted Text (excerpt):", extractedText.substring(0, 99));
    }
    // Create Solr document
    const solrDocument: SolrDocument = {
      id: documentUrl, // The public URL doubles as the document's unique identifier
      text: extractedText, // Add the extracted text content
      sha256sum: checksum, // Add the checksum
      //html: response.data,
      url: documentUrl,
      content_length: fileSize,
      content_type: extToMime(documentUrl),
      // Add additional fields as needed (e.g., title, author, etc.)
    };
    // Save a copy of the text into a file
    // const fileName = fileFullPath + ".md";
    // try {
    //   fs.writeFileSync(fileName, extractedText);
    //   console.log(`File ${fileName} has been written successfully!`);
    // } catch (err) {
    //   console.error('Error writing to file', err);
    // }
    // Send document to Solr for indexing
    // Index the file with its text content and metadata
    console.log(`Indexing ${documentUrl}`);
    await indexDocumentInSolr(solrDocument);
    // Continue
    console.log(`Done.`);
  }
});
// Index everything: the document collection first, then the law collection.
gulp.task(
  'index',
  gulp.series('index:docs', 'index:laws'),
);
// Clear the index, then run the full indexing pass.
gulp.task(
  'index:reindex',
  gulp.series('index:clear', 'index'),
);
// Running gulp without a task name performs the indexing pass.
gulp.task(
  'default',
  gulp.series('index'),
);