import gulp from 'gulp';
import request from 'request-promise-native';
import axios from 'axios';
import { glob } from 'glob';
import fs, { WriteStream } from 'fs';
import path from 'path';
import crypto from 'crypto';
import url from 'url';
import { TikaClient } from './app/tika/index.mjs';
import { Readable, Writable } from 'stream';
import dotenv from 'dotenv';
import dotenvExpand from 'dotenv-expand';
import process from 'process';
import { config } from './app/config.mjs';
import { SolrDocument } from './app/search/solr-doc.mjs';
// import ts from 'gulp-typescript';
import run from 'gulp-run';
import { deleteAsync } from 'del';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// ---------------------------------------------------------------------------
// Build tasks
// ---------------------------------------------------------------------------

// Compile the project by shelling out to `tsc` with the build tsconfig.
gulp.task('build:typescript:compile', () => {
  // const tsProject = ts.createProject('tsconfig.build.json');
  // return tsProject.src().pipe(tsProject()).js.pipe(gulp.dest('../dist'));
  return run('tsc -project tsconfig.build.json', { verbosity: 3 }).exec();
});

// Empty ../dist, keeping the compiled gulpfile and its declaration file.
gulp.task('build:clean', async () => {
  return await deleteAsync(['../dist/**/*', '!../dist/gulpfile.mjs', '!../dist/gulpfile.d.mts']);
});

gulp.task('build:copy:tsconfig', () => {
  return gulp.src(['../tsconfig.build.json', '../tsconfig.json']).pipe(gulp.dest('../dist'));
});

// The copy tasks return the stream itself so gulp waits for the copy to finish.
// (Awaiting a stream inside an async task resolves immediately, which let the
// next task in the series start before the files were written.)
gulp.task('build:copy:assets', () => {
  return gulp.src('../assets/**/*').pipe(gulp.dest('../dist/assets'));
});

gulp.task('build:copy:pages', () => {
  return gulp.src('../pages/**/*').pipe(gulp.dest('../dist/pages'));
});

gulp.task('build:copy:views', () => {
  return gulp.src('../app/views/**/*').pipe(gulp.dest('../dist/views'));
});

gulp.task(
  'build',
  gulp.series(
    'build:clean',
    'build:copy:tsconfig',
    'build:copy:pages',
    'build:copy:assets',
    'build:copy:views',
    'build:typescript:compile',
  ),
);

// ---------------------------------------------------------------------------
// Solr helpers
// ---------------------------------------------------------------------------

// Task to clear out previous Solr data
gulp.task('index:clear', async () => {
  await request({
    uri: `${config.solrDocsUrl}/update?commit=true`,
    method: 'POST',
    body: { delete: { query: '*:*' } }, // Delete all documents
    json: true,
  });
});

// gulp.task('dbfromsolr', async () => {
//   let docs = await request({
//     uri: `${config.solrDocsUrl}/select`,
//     qs: {
//       q: '*:*',
//       wt: 'json',
//       rows: 10000,
//     },
//     json: true
//   });
//   docs = docs?.response?.docs?.map(({id, sha256sum, url, content_length, content_type, text, _version_}: SolrDocument) =>
//   {
//     return {
//       id,
//       url: url.join(''),
//       content_length: parseInt(content_length.join()),
//       sha256sum: sha256sum.join(''),
//       content_type: content_type.join(''),
//       text: text.join(''),
//       _version_,
//     }
//   }).map(doc => {
//   })
// });

/**
 * Subset of a Solr document requested for change detection
 * (`fl=sha256sum, content_length`).
 *
 * NOTE(review): the field shapes are not visible from this file. The original
 * comparison read element `[0]` of each field, which suggests multi-valued
 * fields; both array and scalar values are accepted here.
 */
interface IndexedFileMetadata {
  sha256sum?: unknown;
  content_length?: unknown;
}

/**
 * Stream a file through SHA-256 and return the digest.
 *
 * @param filePath Path of the file to hash.
 * @returns The lowercase hex digest; rejects if the file cannot be read.
 */
async function calculateSHA256Hash(filePath: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const readStream = fs.createReadStream(filePath);
    const hash = crypto.createHash('sha256');
    readStream.on('data', (chunk) => {
      hash.update(chunk);
    });
    readStream.on('end', () => {
      resolve(hash.digest('hex'));
    });
    readStream.on('error', (error) => {
      reject(error);
    });
  });
}

/**
 * Ask a Solr `select` endpoint for the stored checksum and size of one document.
 *
 * @param selectUrl Full URL of the core's `/select` handler.
 * @param url Document id (the public URL of the file).
 * @returns The first matching document, or `undefined` when there is none.
 */
async function fetchIndexedMetadata(selectUrl: string, url: string): Promise<IndexedFileMetadata | undefined> {
  const fl = encodeURIComponent('sha256sum, content_length');
  const q = encodeURIComponent('id:') + '"' + encodeURIComponent(url) + '"';
  const response = await request({ uri: `${selectUrl}?q=${q}&fl=${fl}`, json: true });
  return response?.response?.docs?.[0] || undefined;
}

// Function to retrieve metadata of a file from the Virginia law Solr core
async function retrieveVirginiaLawMetadataFromSolr(url: string) {
  return fetchIndexedMetadata(`${config.solrLawUrl}/select`, url);
}

// Function to retrieve metadata of a file from the documents Solr core
async function retrieveMetadataFromSolr(url: string) {
  return fetchIndexedMetadata(`${config.solrDocsUrl}/select`, url);
}

/**
 * POST one document to a Solr JSON update handler and commit immediately.
 *
 * @throws Error prefixed with "Error indexing document in Solr: " on any
 *   failure. The previous implementation silently swallowed failures whose
 *   error had no `message`; every failure is now surfaced.
 */
async function postDocumentToSolr(updateUrl: string, document: SolrDocument): Promise<void> {
  try {
    await axios.post(updateUrl, document, {
      params: {
        commit: true, // Commit changes immediately
      },
    });
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error('Error indexing document in Solr: ' + reason);
  }
}

async function indexDocumentInSolr(document: SolrDocument) {
  await postDocumentToSolr(config.solrDocsUrl + '/update/json/docs', document);
}

async function indexLawDocumentInSolr(document: SolrDocument) {
  await postDocumentToSolr(config.solrLawUrl + '/update/json/docs', document);
}

/**
 * Map a file name (or URL) to a MIME type based on its extension.
 * Unknown extensions, including none at all, are reported as `text/plain`.
 */
function extToMime(fileName: string) {
  switch (path.extname(fileName)) {
    case '.htm':
    case '.html':
      return 'text/html';
    case '.pdf':
      return 'application/pdf';
    case '.md':
    case '.txt':
      // These previously fell through to the Matroska case below.
      return 'text/plain';
    case '.mkv':
      return 'video/x-matroska';
    default:
      return 'text/plain';
  }
}

// ---------------------------------------------------------------------------
// Indexing helpers
// ---------------------------------------------------------------------------

/** Return the first element of an array value, or the value itself otherwise. */
function firstValue(value: unknown): unknown {
  return Array.isArray(value) ? value[0] : value;
}

/**
 * Decide whether a file has to be (re)indexed: true when Solr has no entry for
 * it, or when the stored size or checksum differs from the file on disk.
 * A missing stored field counts as a mismatch rather than throwing.
 */
function needsIndexing(metadata: IndexedFileMetadata | undefined, fileSize: number, checksum: string): boolean {
  if (!metadata) {
    return true;
  }
  const storedSize = parseInt(String(firstValue(metadata.content_length)), 10);
  const storedChecksum = firstValue(metadata.sha256sum);
  return storedSize !== fileSize || storedChecksum !== checksum;
}

/**
 * Expand glob patterns relative to `cwd` into a list of matching files.
 * Overlapping patterns (e.g. `dir/*` and `dir/**` + `/*`) can match the same
 * file, so the result is de-duplicated, keeping first-match order.
 */
function findFiles(patterns: string[], cwd: string): string[] {
  const matches = patterns.flatMap((pattern) =>
    glob.globSync(pattern, {
      cwd,
      matchBase: true,
      follow: true,
    }),
  );
  return [...new Set(matches)];
}

/**
 * Send a file to Tika and collect the extracted plain text.
 *
 * Chunks are buffered and decoded once at the end so that a multi-byte UTF-8
 * character split across two chunks is not corrupted.
 * NOTE(review): assumes `client.pipe` resolves only after all output has been
 * written to the sink, as the original code did — confirm in TikaClient.
 */
async function extractTextWithTika(client: TikaClient, fileFullPath: string, file: string): Promise<string> {
  const source = fs.createReadStream(fileFullPath);
  const chunks: Buffer[] = [];
  const sink = new Writable({
    write(chunk, encoding, callback) {
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk), encoding));
      callback();
    },
  });
  await client.pipe(source, sink, 'text/plain', encodeURI(path.basename(file)));
  return Buffer.concat(chunks).toString('utf8');
}

// ---------------------------------------------------------------------------
// Index tasks
// ---------------------------------------------------------------------------

// Task to index law files into Solr
gulp.task('index:laws', async () => {
  //let scanExts = ''; //set to empty string to scan all
  const scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt}';
  const globs = [
    `Russell_County/Ordinances/**/*${scanExts}`,
    `Virginia_Law_Library/**/*${scanExts}`,
  ];

  // Use glob to match files in the local directories
  // NOTE(review): 'index:docs' resolves this path with an extra '..' segment;
  // confirm which base directory is intended here.
  const cwd = path.resolve(__dirname, config.publicPath.replaceAll('/', path.sep));
  const files = findFiles(globs, cwd);
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);

  // Loop through each file and process them
  for (const [index, file] of files.entries()) {
    console.log(`${index + 1}/${files.length}: ${file}`);
    const fileFullPath = path.join(cwd, file);
    const url = `${config.siteUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + url);

    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveVirginiaLawMetadataFromSolr(url);

    // Calculate file size and SHA256 checksum
    const fileSize = fs.statSync(fileFullPath).size;
    const checksum = await calculateSHA256Hash(fileFullPath);

    if (!needsIndexing(metadata, fileSize, checksum)) {
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
      continue;
    }

    // Metadata mismatch or file not found in Solr, proceed with indexing
    console.log(`Processing text from file using Tika.`);
    const client = new TikaClient({ host: config.tikaUrl });
    const version = await client.getVersion();
    console.info(`Tika Server Version: ${version}`);

    const extractedText = await extractTextWithTika(client, fileFullPath, file);
    console.log("Extracted Text:", extractedText);

    // Create Solr document
    const solrDocument: SolrDocument = {
      id: url, // Replace with a unique identifier for the document
      text: extractedText, // Add the extracted text content
      sha256sum: checksum, // Add the checksum
      //html: response.data,
      url: url,
      content_length: fileSize,
      content_type: extToMime(url),
      // Add additional fields as needed (e.g., title, author, etc.)
    };

    // Index the file with its text content and metadata
    console.log(`Indexing ${url}`);
    await indexLawDocumentInSolr(solrDocument);
    console.log(`Done.`);
  }
});

// Task to index document files into Solr
gulp.task('index:docs', async () => {
  //let scanExts = ''; //set to empty string to scan all
  const scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt,mkv}';
  const directories = [
    '2024-02-16 FOIA Response',
    'Amys_Drop_Box',
    'CRS_Reports',
    'Cumberland_Plateau_Planning_District_Commission',
    'David_Eaton_District_4_-_Supervisor_Screenshot_Collection',
    'Environmental_Protection_Agency',
    'Gentry_Locke',
    'Mine_Safety_and_Health_Administration',
    'News',
    'Potesta_&_Associates',
    'Russell_County',
    'Scott_County',
    'Tobacco_Region_Revitalization_Commission',
    'United_Mine_Workers_of_America',
    'Virginia_Energy',
    'Virginia_Governor',
    'Virginia_Tech',
    // I want to put Virginia Law in its own search category first.
    // 'Virginia_Law_Library',
  ];
  // Two patterns per directory: files directly inside it, then files at any depth.
  const globs = directories.flatMap((directory) => [
    `${directory}/*${scanExts}`,
    `${directory}/**/*${scanExts}`,
  ]);

  // Extensions treated as video; these are indexed from a sibling subtitle file.
  const videoExts = ['.webm', '.mkv', '.mpg', '.mpeg', '.mp4'];
  const subtitleExt = ".en.vtt";

  // Use glob to match files in the local directories
  const cwd = path.resolve(__dirname, '..', config.publicPath.replaceAll('/', path.sep));
  const files = findFiles(globs, cwd);
  console.log(`Found ${files.length} files to index using ${globs.length} glob patterns.`);

  // Loop through each file and process them
  for (const [index, file] of files.entries()) {
    console.log(`${index + 1}/${files.length}: ${file}`);
    const fileFullPath = path.join(cwd, file);
    let url = `${config.siteUrl}/${file.replaceAll(path.sep, '/')}`;
    console.log('URL: ' + url);

    // Retrieve metadata of the file from Solr (if it exists)
    const metadata = await retrieveMetadataFromSolr(url);

    // Calculate file size and SHA256 checksum
    const fileSize = fs.statSync(fileFullPath).size;
    const checksum = await calculateSHA256Hash(fileFullPath);

    if (!needsIndexing(metadata, fileSize, checksum)) {
      console.log('prior metadata:', metadata);
      // Save a copy of the text into a file
      // const fileName = fileFullPath + ".md";
      // try {
      //   fs.writeFileSync(fileName, extractedText);
      //   console.log(`File ${fileName} has been written successfully!`);
      // } catch (err) {
      //   console.error('Error writing to file', err);
      // }
      // Metadata matches, skip the file
      console.log(`Skipping file '${file}' as metadata matches existing metadata in Solr index.`);
      continue;
    }

    // Metadata mismatch or file not found in Solr, proceed with indexing
    console.log(`Processing text from file using Tika.`);
    const client = new TikaClient({ host: config.tikaUrl });
    const version = await client.getVersion();
    console.info(`Tika Server Version: ${version}`);

    let extractedText = '';
    const currentUrl = url;
    if (videoExts.some((ext) => currentUrl.endsWith(ext))) {
      const subtitleFilePath = fileFullPath.substring(0, fileFullPath.lastIndexOf('.')) + subtitleExt;
      if (fs.existsSync(subtitleFilePath)) {
        console.log("Found VTT subtitle file at:", subtitleFilePath);
        extractedText = fs.readFileSync(subtitleFilePath, 'utf8');
        // NOTE(review): the document URL/id becomes the containing directory
        // URL, while the metadata lookup above uses the file URL, so the
        // lookup will not find this entry on later runs, and videos sharing a
        // directory share one id — confirm this is intended.
        url = url.substring(0, url.lastIndexOf('/') + 1);
      } else {
        console.log("No subtitles found at: ", subtitleFilePath);
        console.log("Skipping this video file. Not adding this to the index until subtitles are available.")
        continue;
      }
    } else {
      extractedText = await extractTextWithTika(client, fileFullPath, file);
    }

    if (!extractedText) {
      console.log("Skipping document because no text was detected.");
      continue;
    } else if (extractedText.length < 100) {
      console.log("Extracted Text:", extractedText);
    } else {
      console.log("Extracted Text (excerpt):", extractedText.substring(0, 99));
    }

    // Create Solr document
    const solrDocument = {
      id: url, // Replace with a unique identifier for the document
      text: extractedText, // Add the extracted text content
      sha256sum: checksum, // Add the checksum
      //html: response.data,
      url: url,
      content_length: fileSize,
      content_type: extToMime(url),
      // Add additional fields as needed (e.g., title, author, etc.)
    };

    // Save a copy of the text into a file
    // const fileName = fileFullPath + ".md";
    // try {
    //   fs.writeFileSync(fileName, extractedText);
    //   console.log(`File ${fileName} has been written successfully!`);
    // } catch (err) {
    //   console.error('Error writing to file', err);
    // }

    // Index the file with its text content and metadata
    console.log(`Indexing ${url}`);
    await indexDocumentInSolr(solrDocument);
    console.log(`Done.`);
  }
});

// Run both indexing tasks: documents first, then laws
gulp.task('index', gulp.series('index:docs', 'index:laws'));

// Task to optionally run both clearing and indexing
gulp.task('index:reindex', gulp.series('index:clear', 'index'));

// Default task to run indexing
gulp.task('default', gulp.series('index'));