diff --git a/app/helpers/functions.mts b/app/helpers/functions.mts index a7b3aa62..13907508 100644 --- a/app/helpers/functions.mts +++ b/app/helpers/functions.mts @@ -6,6 +6,7 @@ import markdownIt from 'markdown-it'; import markdownItAttrs from 'markdown-it-attrs'; import momentJs from 'moment-timezone'; import { inspect } from 'util'; +import matter from 'gray-matter'; const moment = momentJs.tz.setDefault("UTC"); const md = markdownIt({ html: true, @@ -77,10 +78,19 @@ const resolveReadmeFile: (directory: string) => string|undefined = (directory) = const directoryContainsReadme = (directory: string) => resolveReadmeFile(directory); // const printMarkdownFile = (file) => { // }; +const readmeFm = (directory: string) => { + const readmeFile = resolveReadmeFile(directory); + if (readmeFile) { + return matter.read(readmeFile).data; + } +}; const printReadme = (directory: string) => { const readmeFile = resolveReadmeFile(directory); if (readmeFile) { - return md.render(fs.readFileSync(readmeFile).toString()); + const fm = matter.read(readmeFile); + const fmData = { fm: fm.data, excerpt: fm.excerpt }; + const content = md.render(fm.content, fmData ); + return content; } }; const stripWebVTT = (webvttText: string) => { @@ -187,6 +197,7 @@ export default { shouldShowSiteWelcomeMessage, shouldOmitLinkOnLastBreadcrumb, directoryContainsReadme, + readmeFm, printReadme, stripWebVTT, renderArchive, diff --git a/app/tika/client.mts b/app/tika/client.mts index 629b0f1a..378d722d 100644 --- a/app/tika/client.mts +++ b/app/tika/client.mts @@ -1,4 +1,4 @@ -import { ReadStream, WriteStream } from 'fs' +import { ReadStream } from 'fs' import fetch from 'node-fetch' let join = (...args: String[]) => { let output = ""; @@ -8,6 +8,7 @@ let join = (...args: String[]) => { return output; }; import { ContentResource, MetadataResource } from './types.mjs' +import { Writable } from 'stream'; export interface TikaClientOptions { host: string @@ -64,7 +65,7 @@ export class TikaClient { return response.body } - async pipe(readStream: ReadStream, writeStream: WriteStream, contentType: TikaContentType = 'text/plain', filename?: string): Promise { + async pipe(readStream: ReadStream, writeStream: Writable, contentType: TikaContentType = 'text/plain', filename?: string): Promise { const tikaStream = await this.getStream(readStream, contentType, filename) return new Promise((resolve, reject) => { const stream = tikaStream.pipe(writeStream) diff --git a/app/views/directory.ejs b/app/views/directory.ejs index 03644cf9..c9cef055 100644 --- a/app/views/directory.ejs +++ b/app/views/directory.ejs @@ -19,6 +19,14 @@ <% if (h.directoryContainsReadme(directory)) {%>
+    <% if (typeof h.readmeFm(directory) !== 'undefined' && h.readmeFm(directory).title) { %>
+      <%= h.readmeFm(directory).title %>
+      <% if (typeof h.readmeFm(directory).docDate !== 'undefined') { %>Document Date: <%= h.moment(h.readmeFm(directory).docDate).format('MMMM D, YYYY') %><% } %>
+    <% } %>
     <%- h.printReadme(directory) %>
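Aside: the helper changes above hinge on gray-matter's split of a README into a parsed YAML header (`data`) and the remaining markdown body (`content`). A minimal sketch of that flow outside the app, using a hypothetical `docs/README.md` (the path and header keys here are illustrative, not from the repo):

```ts
import matter from 'gray-matter';
import markdownIt from 'markdown-it';

const md = markdownIt({ html: true });

// matter.read() is the file-path wrapper around matter(); it returns
// { data, content, excerpt } with the YAML header parsed into `data`.
const fm = matter.read('docs/README.md'); // hypothetical path

// A template can branch on header fields before rendering the body,
// which is what the directory.ejs change above does with title/docDate:
if (fm.data.title) {
  console.log(fm.data.title, fm.data.docDate);
}

// `content` already excludes the header, so render() sees only markdown.
// The second argument is markdown-it's optional env object, which plugins
// and custom rules can read during rendering.
const html = md.render(fm.content, { fm: fm.data, excerpt: fm.excerpt });
console.log(html);
```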
diff --git a/gulpfile.mts b/gulpfile.mts index 0691534d..4fdae290 100644 --- a/gulpfile.mts +++ b/gulpfile.mts @@ -16,6 +16,9 @@ import { SolrDocument } from './app/search/solr-doc.mjs'; // import ts from 'gulp-typescript'; import run from 'gulp-run'; import { deleteAsync } from 'del'; +import { fileURLToPath } from 'url'; +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); gulp.task('build:typescript:compile', () => { // const tsProject = ts.createProject('tsconfig.build.json'); @@ -229,7 +232,7 @@ gulp.task('index:laws', async () => { // Create a writable stream to capture the extracted text content into a string let extractedText = ''; - const writableStream = new WriteStream({ + const writableStream = new Writable({ write(chunk, encoding, callback) { extractedText += chunk.toString(); // Append the chunk to the extracted text callback(); @@ -271,21 +274,29 @@ gulp.task('index:docs', async () => { //let scanExts = ''; //set to empty string to scan all let scanExts = '.{pdf,docx,pptx,xlsx,jpg,png,txt,mkv}'; let globs = [ - `Amys_Drop_Box/**/*${scanExts}`, - `CRS_Reports/**/*${scanExts}`, - `Mine_Safety_and_Health_Administration/**/*${scanExts}`, - `Potesta_&_Associates/**/*${scanExts}`, - `Russell_County/**/*${scanExts}`, - `Russell_County_Reclamation_LLC/**/*${scanExts}`, - `Tobacco_Region_Revitalization_Commission/**/*${scanExts}`, - `United_Mine_Workers_of_America/**/*${scanExts}`, - `Virginia_Energy/**/*${scanExts}`, + `2024-02-16 FOIA Response/*${scanExts}`, `2024-02-16 FOIA Response/**/*${scanExts}`, + `Amys_Drop_Box/*${scanExts}`, `Amys_Drop_Box/**/*${scanExts}`, + `CRS_Reports/*${scanExts}`, `CRS_Reports/**/*${scanExts}`, + `Cumberland_Plateau_Planning_District_Commission/*${scanExts}`, `Cumberland_Plateau_Planning_District_Commission/**/*${scanExts}`, + `David_Eaton_District_4_-_Supervisor_Screenshot_Collection/*${scanExts}`, `David_Eaton_District_4_-_Supervisor_Screenshot_Collection/**/*${scanExts}`, + `Environmental_Protection_Agency/*${scanExts}`, `Environmental_Protection_Agency/**/*${scanExts}`, + `Gentry_Locke/*${scanExts}`, `Gentry_Locke/**/*${scanExts}`, + `Mine_Safety_and_Health_Administration/*${scanExts}`, `Mine_Safety_and_Health_Administration/**/*${scanExts}`, + `News/*${scanExts}`, `News/**/*${scanExts}`, + `Potesta_&_Associates/*${scanExts}`, `Potesta_&_Associates/**/*${scanExts}`, + `Russell_County/*${scanExts}`, `Russell_County/**/*${scanExts}`, + `Scott_County/*${scanExts}`, `Scott_County/**/*${scanExts}`, + `Tobacco_Region_Revitalization_Commission/*${scanExts}`, `Tobacco_Region_Revitalization_Commission/**/*${scanExts}`, + `United_Mine_Workers_of_America/*${scanExts}`, `United_Mine_Workers_of_America/**/*${scanExts}`, + `Virginia_Energy/*${scanExts}`, `Virginia_Energy/**/*${scanExts}`, + `Virginia_Governor/*${scanExts}`, `Virginia_Governor/**/*${scanExts}`, + `Virginia_Tech/*${scanExts}`, `Virginia_Tech/**/*${scanExts}`, // I want to put Virginia Law in its own search category first. 
// `Virginia_Law_Library/**/*${scanExts}`, ]; // Use glob to match files in the local directories let files: string[] = []; - let cwd = path.resolve(__dirname, config.publicPath.replaceAll('/', path.sep)); + let cwd = path.resolve(__dirname, '..', config.publicPath.replaceAll('/', path.sep)); globs.forEach(async (globPattern) => { files = files.concat(glob.globSync(globPattern, { cwd, @@ -343,7 +354,7 @@ gulp.task('index:docs', async () => { // Create a Readable stream for the file contents let f = fs.createReadStream(fileFullPath); // Create a writable stream to capture the extracted text content into a string - const writableStream = new WriteStream({ + const writableStream = new Writable({ write(chunk, encoding, callback) { extractedText += chunk.toString(); // Append the chunk to the extracted text callback(); @@ -376,13 +387,13 @@ gulp.task('index:docs', async () => { }; // Save a copy of the text into a file - const fileName = fileFullPath + ".md"; - try { - fs.writeFileSync(fileName, extractedText); - console.log(`File ${fileName} has been written successfully!`); - } catch (err) { - console.error('Error writing to file', err); - } + // const fileName = fileFullPath + ".md"; + // try { + // fs.writeFileSync(fileName, extractedText); + // console.log(`File ${fileName} has been written successfully!`); + // } catch (err) { + // console.error('Error writing to file', err); + // } // Send document to Solr for indexing // Index the file with its text content and metadata diff --git a/one-time-conversion.mts b/one-time-conversion.mts new file mode 100644 index 00000000..df086ff0 --- /dev/null +++ b/one-time-conversion.mts @@ -0,0 +1,223 @@ +/** + * Operation 1: Clear orphan index entries which have no file in public. Log output. Keep this gulp operation for later, call it 'index:clean-missing'. + * + * 1. Connect to Solr. + * 2. Query * for all the documents. Return all the records and all the fields. + * 3. For each of the records, check that the file exists. If the file does not exist on the filesystem, delete the index record. + * + * Operation 2: Save the stored records to the `${pathToFile}.md` file. + * + * 4. Check if the new file exists. If it exists, exit the procedure. + * 5. If it does not exist, then create it. + * 6. Write the contents to the file. Particularly, every file should have a YAML header. In that YAML header I want the following information stored: + * --- + * type: document + * title: document title + * mimeType: text/plain, application/vnd-word, etc.; use known good list + * tags: + * - customized based on path, perhaps one tag per folder name + * path: relative path to document file + * sha256sum: checksum for SHA2-256 checksum + * sha1sum: checksum for SHA1 checksum + * --- + * ...document plain text from the OCR operations. + * + * Operation 3: Save the Solr record without arrays into a second Solr instance. + * + * 7. Connect to second Solr instance. + * 8. Store document correctly according to the new schema without additional cardinality of the search fields. + * 9. Index all information in the .md (frontmatter and plaintext) schema into the Solr schema. + * + * Operation 4: Switch all code to use new Solr schema. + * + * 10. Update references to Solr so they use the new instance. + * + * Operation 5: Convert indexing code so that the Tika->.md file generation occurs in one operation and the .md->Solr scan occurs in another. + * + * 11. Run both operations to reindex the site. 
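+ *
+ * (For reference, Operation 1's delete step below uses Solr's JSON update
+ * syntax, posting a body like { "delete": [ "<doc id>" ], "commit": {} }
+ * to `${config.solrDocsUrl}/update/json`.)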
+ */ + +/** OPERATION 1: */ + +import axios from 'axios'; +import { config } from './app/config.mjs'; +import { SolrDocument, IncorrectStyleSolrDocument } from './app/search/solr-doc.mjs'; +import request from 'request-promise-native'; +import fs from 'fs'; +import path from 'path'; +import fm from 'gray-matter'; + +// Function to retrieve metadata of a file from Solr +async function getAllSolrDocuments(start: number = 0, rows: number = 10) { + const res = await axios({ method: 'get', url: `${config.solrDocsUrl}/select?q=*:*&start=${start}&rows=${rows}`, responseType: 'json' }) + return res.data.response.docs; +} + +async function removeSolrDocument(id: string) { + const res = await axios({ method: 'post', url: `${config.solrDocsUrl}/update/json`, responseType: 'json', data: { commit: {}, delete: [ id ] } }); + return res.data; +} + +const existingSolrDocuments: IncorrectStyleSolrDocument[] = (await getAllSolrDocuments(0, 65535)); +const validSolrDocuments: StoredSolrDocument[] = []; +const stripLeftText = "https://no-moss-3-carbo-landfill-library.online/"; + +interface StoredSolrDocument extends SolrDocument { + relativePathToFile: string; + absolutePathToFile: string; + doesExist: boolean; +}; + +for (const oldSolrDoc of existingSolrDocuments) { + const solrDoc: StoredSolrDocument = { + id: oldSolrDoc.id, + content_length: oldSolrDoc.content_length && oldSolrDoc.content_length.length ? oldSolrDoc.content_length[0] : 0, + content_type: oldSolrDoc.content_type && oldSolrDoc.content_type.length ? oldSolrDoc.content_type[0] : '', + sha256sum: oldSolrDoc.sha256sum && oldSolrDoc.sha256sum.length ? oldSolrDoc.sha256sum[0] : '', + text: oldSolrDoc.text && oldSolrDoc.text.length ? oldSolrDoc.text[0] : '', + url: oldSolrDoc.url && oldSolrDoc.url.length ? oldSolrDoc.url[0] : '', + _version_: oldSolrDoc._version_, + relativePathToFile: '', + absolutePathToFile: '', + doesExist: false, + }; + solrDoc.relativePathToFile = (solrDoc.url.startsWith(stripLeftText) ? 
solrDoc.url.substring(stripLeftText.length) : solrDoc.url).replaceAll('/', path.sep); + solrDoc.absolutePathToFile = path.resolve(path.join(config.publicPath, solrDoc.relativePathToFile)); + solrDoc.doesExist = fs.existsSync(solrDoc.absolutePathToFile); + // build the correct object + if (!solrDoc.doesExist) { + await removeSolrDocument(solrDoc.id).then((response) => { + console.log(`Removed doc from search index because the file doesn't exist on the volume:`); + console.log(` id: ${solrDoc.id}`); + console.log(` url: ${solrDoc.url}`); + console.log(` path: ${solrDoc.absolutePathToFile}`); + console.log(` Response:`, response); + }) + } + else { + validSolrDocuments.push(solrDoc); + } +}; + +/** OPERATION 2: */ +import moment from 'moment'; +import { mkdir } from 'fs/promises'; +function resolveTitleFromFileName(fileName: string) { + return path.basename(fileName, path.extname(fileName)).replaceAll('_', ' ') +} + +function resolveDateFromFileName(fileName: string) { + // style 1: convert: 2024-06-19_00_25_00 to: 2024-06-19 12:25:00 am + const m1 = 'YYYY-MM-DD hh:mm:ss a'; + const re1 = /^(\d{4})-(\d{2})-(\d{2})_(\d{2})_(\d{2})_(\d{2}).*/g; + // style 2: convert: 2024-06-19_00_25 to: 2024-06-19 12:25:00 am + const m2 = m1; + const re2 = /^(\d{4})-(\d{2})-(\d{2})_(\d{2})_(\d{2}).*/g; + // style 3: convert: 2024-06-19 to: 2024-06-19 + const re3 = /^(\d{4})-(\d{2})-(\d{2}).*/g; + // style 4: convert: 2024-06 to: 2024-06 + const re4 = /^(\d{4})-(\d{2}).*/g; + // style 5: convert: 2024 to: 2024 + const re5 = /^(\d{4}).*/g; + if (fileName.search(re1) >= 0) { + return moment(fileName.replace(re1, `$1-$2-$3 $4:$5:$6`)).format(m1); + } + else if (fileName.search(re2) >= 0) { + return moment(fileName.replace(re2, `$1-$2-$3 $4:$5`)).format(m2); + } + else if (fileName.search(re3) >= 0) { + return fileName.replace(re3, `$1-$2-$3`); + } + else if (fileName.search(re4) >= 0) { + return fileName.replace(re4, `$1-$2`); + } + else if (fileName.search(re5) >= 0) { + return fileName.replace(re5, `$1`); + } +} + +function resolveTagsFromRelativeFilePath(relFilePath: string) { + let tags = relFilePath.split(path.sep); + const fileName = tags.pop()!; + const dateTag = resolveDateFromFileName(fileName!); + if (dateTag) { + // style 3: convert: 2024-06-19 to: 2024-06-19 + const re3 = /^(\d{4})-(\d{2})-(\d{2}).*/g; + // style 4: convert: 2024-06 to: 2024-06 + const re4 = /^(\d{4})-(\d{2}).*/g; + // style 5: convert: 2024 to: 2024 + const re5 = /^(\d{4}).*/g; + if (fileName.search(re5) >= 0) { + tags.push(fileName.replace(re5, `$1`)); + } + if (fileName.search(re4) >= 0) { + tags.push(fileName.replace(re4, `$1-$2`)); + } + if (fileName.search(re3) >= 0) { + tags.push(fileName.replace(re3, `$1-$2-$3`)); + } + } + return tags; +} + +import crypto from 'crypto'; +async function calculateSHA1Hash(filePath: string) { + return new Promise((resolve, reject) => { + const readStream = fs.createReadStream(filePath); + const hash = crypto.createHash('SHA1'); + + readStream.on('data', (chunk) => { + hash.update(chunk); + }); + + readStream.on('end', () => { + const sha1Hash: string = hash.digest('hex'); + resolve(sha1Hash); + }); + + readStream.on('error', (error) => { + reject(error); + }); + }); +} + +for (const solrDoc of validSolrDocuments) { + //guard: don't write this file for video files, all of which conveniently contain the string YouTube_Archive + if (solrDoc.relativePathToFile.indexOf('YouTube_Archive') >= 0 || solrDoc.relativePathToFile.indexOf('2024-03-20-Governor_Youngkin_Listening_Session') >= 0) { + 
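+    // Guard: as the log below notes, the YouTube archive (and the Governor
+    // listening-session recordings) use a different metadata technique, so
+    // Operation 2 must not generate README.md frontmatter folders for them.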
console.log(`This Solr document is for a YouTube_Archive video, which uses a different metadata technique. Skipping creation of markdown file in folder.`); + continue; + } + //guard: don't write this + const mdFolderPath = path.join(path.dirname(solrDoc.absolutePathToFile), path.basename(solrDoc.absolutePathToFile, path.extname(solrDoc.absolutePathToFile))); + if (fs.existsSync(mdFolderPath)) { + console.log(`Directory at ${mdFolderPath} already exists. Continuing check for markdown file in folder.`); + } + else { + //create folder + await mkdir(mdFolderPath); + console.log(`Directory at ${mdFolderPath} created.`); + } + const mdFilePath = path.join(mdFolderPath, 'README.md'); + // if (fs.existsSync(mdFilePath)) { + // console.log(`Markdown file at ${mdFilePath} already exists. Skipping creation of markdown file in folder.`); + // continue; + // } + // else { + console.log(solrDoc.id); + console.log(`Saving new markdown file at ${mdFilePath}:`); + const mdFileContents = fm.stringify(solrDoc.text, { + type: 'document', + title: resolveTitleFromFileName(solrDoc.absolutePathToFile), + file: path.posix.join('..', path.basename(solrDoc.absolutePathToFile)), + tags: resolveTagsFromRelativeFilePath(solrDoc.relativePathToFile), + docDate: resolveDateFromFileName(path.basename(solrDoc.absolutePathToFile))||null, + contentType: solrDoc.content_type, + contentLength: solrDoc.content_length, + sha256sum: solrDoc.sha256sum, + sha1sum: await calculateSHA1Hash(solrDoc.absolutePathToFile), + }); + let ws = fs.createWriteStream(mdFilePath); + ws.write(mdFileContents); + ws.close(); + // } +} diff --git a/package.json b/package.json index c5a1e54a..09c6c122 100644 --- a/package.json +++ b/package.json @@ -10,10 +10,10 @@ "test": "echo \"Error: no test specified\" && exit 1", "transpile:ts": "tsc -project tsconfig.pre-build.json", "index": "tsc -project tsconfig.pre-build.json && gulp -f dist/gulpfile.mjs index", - "index:clear": "tsc -project tsconfig.json && gulp -f dist/gulpfile.mjs index:clear", - "index:docs": "tsc -project tsconfig.json && gulp -f dist/gulpfile.mjs index:docs", - "index:laws": "tsc -project tsconfig.json && gulp -f dist/gulpfile.mjs index:laws", - "index:reindex": "tsc -project tsconfig.json && gulp -f dist/gulpfile.mjs index:reindex" + "index:clear": "tsc -project tsconfig.pre-build.json && gulp -f dist/gulpfile.mjs index:clear", + "index:docs": "tsc -project tsconfig.pre-build.json && gulp -f dist/gulpfile.mjs index:docs", + "index:laws": "tsc -project tsconfig.pre-build.json && gulp -f dist/gulpfile.mjs index:laws", + "index:reindex": "tsc -project tsconfig.pre-build.json && gulp -f dist/gulpfile.mjs index:reindex" }, "author": "", "license": "ISC",
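To make Operation 2's output in one-time-conversion.mts concrete: the README.md files it writes are simply gray-matter's stringify form of the Solr record, which lets the later operations round-trip them. A sketch with invented values (the title, file name, tags, and digests below are illustrative only):

```ts
import matter from 'gray-matter';

// Invented example values; the real script derives these from the Solr
// record and the file path (resolveTitleFromFileName,
// resolveTagsFromRelativeFilePath, calculateSHA1Hash, etc.).
const readme = matter.stringify('Plain text extracted by Tika/OCR...', {
  type: 'document',
  title: '2024-06-19 Board Minutes',       // hypothetical
  file: '../2024-06-19_Board_Minutes.pdf', // hypothetical
  tags: ['Russell_County', '2024', '2024-06', '2024-06-19'],
  docDate: '2024-06-19',
  contentType: 'application/pdf',
  contentLength: 123456,
  sha256sum: '<sha-256 hex digest>',
  sha1sum: '<sha-1 hex digest>',
});

// `readme` begins with a ----delimited YAML header, then the plain text.
// Operations 3 and 5 can recover both halves later:
const parsed = matter(readme);
console.log(parsed.data.title); // '2024-06-19 Board Minutes'
console.log(parsed.content);    // the extracted plain text
```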