Added defunct code in index folder.
This commit is contained in:
parent
5336f9aa73
commit
82450a9355
115
index/build-meetings-dir.js
Normal file
115
index/build-meetings-dir.js
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
const path = require('path');
|
||||||
|
let meetingDocs = require('../bos-tourism-meetings.json');
|
||||||
|
let meetings = [];
|
||||||
|
let meetingsFiles = [];
|
||||||
|
const yaml = require('js-yaml');
|
||||||
|
const cheerio = require('cheerio');
|
||||||
|
|
||||||
|
// Converts a clock string like "7:30 PM" into a 24-hour "HH_MM" key fragment
// used inside meeting file names. Returns "" for falsy input.
// Only handles "H:MM AM|PM"-shaped strings, matching bos-tourism-meetings.json.
function convertTimeStr(hh_mm_ampm) {
  if (!hh_mm_ampm) return "";
  const [clock, meridiem] = hh_mm_ampm.split(" ");
  let [hours, minutes] = clock.split(":").map((part) => parseInt(part, 10));
  // Normalize to 24-hour time. BUG FIX: the original unconditionally added 12
  // for PM, mapping 12:00 PM to hour 24, and left 12:xx AM as hour 12.
  if (meridiem == "PM" && hours !== 12) hours += 12;
  else if (meridiem == "AM" && hours === 12) hours = 0;
  const pad = (n) => (n < 10 ? "0" : "") + n;
  return pad(hours) + "_" + pad(minutes);
}
|
||||||
|
|
||||||
|
// Maps a short agency key to its human-readable display name.
// Unknown keys yield undefined, exactly like the original fall-through switch.
function getAgencyLongName(agency) {
  const longNames = new Map([
    ["Russell_BOS", "Russell County Board of Supervisors"],
    ["Russell_Tourism", "Russell County Tourism Advisory Committee"],
  ]);
  return longNames.get(agency);
}
|
||||||
|
|
||||||
|
// Builds the unique meeting key "<date>_<time>-<agency>-<meeting_name>".
// Side effect (relied on by callers that later read doc.Agency): rewrites the
// short agency codes "BOS"/"Tourism" on `doc` to their fully-qualified forms.
function getKeyName(doc) {
  const agencyAliases = { BOS: "Russell_BOS", Tourism: "Russell_Tourism" };
  if (agencyAliases[doc.Agency]) doc.Agency = agencyAliases[doc.Agency];
  const timePart = convertTimeStr(doc["MeetingTime"]);
  const namePart = doc["MeetingName"].replaceAll(" ", "_");
  return `${doc["MeetingDate"]}_${timePart}-${doc["Agency"]}-${namePart}`;
}
|
||||||
|
|
||||||
|
// Resolves the markdown content path for a meeting key under the Astro site.
function getMeetingFilePath(keyName) {
  const contentDir = ["..", "astro", "src", "content", "meetings"];
  return path.join(...contentDir, `${keyName}.md`);
}
|
||||||
|
|
||||||
|
// create meetings dictionary: one entry per unique meeting file.
// BUG FIX: the original de-dup check was `meetingsFiles.indexOf(keyName) == -1`,
// but meetingsFiles holds {fileName, mdContent} objects, never the key string,
// so the check always passed and every document created a duplicate entry.
// Track seen file names explicitly instead.
const seenMeetingFiles = new Set();
meetingDocs.forEach(doc => {
  let keyName = getKeyName(doc);
  let fileName = getMeetingFilePath(keyName);
  if (!seenMeetingFiles.has(fileName)) {
    seenMeetingFiles.add(fileName);
    meetingsFiles.push({
      fileName,
      mdContent: {
        frontMatter: {
          title: getAgencyLongName(doc.Agency) + " - " + doc.MeetingName + " - " + doc.MeetingDate + " @ " + doc.MeetingTime + "",
          meetingDate: doc.MeetingDate,
          attachments: [], // populated by the second pass over meetingDocs
        },
        content: "",
      },
    });
  }
});
|
||||||
|
|
||||||
|
// go back through and add all of the documents to the meetings in the files:
// every doc whose computed file path matches a meeting file becomes an
// attachment entry in that file's front matter.
meetingsFiles = meetingsFiles.map(file => {
  for (const doc of meetingDocs) {
    const keyName = getKeyName(doc);
    const docFileName = getMeetingFilePath(keyName);
    if (docFileName !== file.fileName) continue;
    file.mdContent.frontMatter.attachments.push({
      title: doc.DocumentCategory,
      relativePath: doc.Host + doc.AgendaPath + doc.AgendaFile.replace('\'', ''),
    });
  }
  return file;
});
|
||||||
|
|
||||||
|
|
||||||
|
// Strips one leading and one trailing "/" (if present) from a path fragment.
const trimSlashes = (str) => {
  let out = str;
  if (out.startsWith('/')) out = out.slice(1);
  if (out.endsWith('/')) out = out.slice(0, -1);
  return out;
};
|
||||||
|
|
||||||
|
// Returns the file extension (with dot) for an agenda file name, after
// stripping stray apostrophes. A few known archive files lack extensions
// entirely; those are special-cased to their actual types. Falls back to the
// empty string when nothing matches.
function getFileExt(file) {
  const ext = path.extname(file.replaceAll('\'', ''));
  if (ext) return ext;
  // Known extension-less files in the archive.
  if (file.endsWith("RC Tourism Committee Meeting Minutes - July 18 2017 Regular")) {
    return ".pdf";
  }
  const docxSuffixes = ['_05222023-162', '_03062023-33', '_03062023-157'];
  if (docxSuffixes.some((suffix) => file.endsWith(suffix))) {
    return ".docx";
  }
  return ext;
}
|
||||||
|
|
||||||
|
// Emit a Windows batch script (to stdout) that copies each agenda file out of
// the local web-site archive into the working directory, renamed to its
// meeting key. Redirect this program's stdout to a .bat file to run it.
console.log("@echo off"); // suppress command echoing in the generated script
meetingDocs.forEach(doc => {
    let keyName = getKeyName(doc);
    let fileName = getMeetingFilePath(keyName);
    // One `copy "src" "dest" >> copy-files.log` line per document.
    console.log(
        "copy " +
        " \"" +
        // Source: the archived file inside the crawled web-site snapshot.
        path.join(
            "..",
            "..",
            "Web_Site_Archives",
            "Russell_County_Web_Site-latest",
            "russellcountyva.us",
            doc.Host,
            trimSlashes(doc.AgendaPath).replaceAll("/", path.sep),
            doc.AgendaFile.replaceAll('\'', ''),
        ) +
        "\" \"" +
        // Destination: "<meetingKey>-<Document_Category><ext>".
        getKeyName(doc) + "-" + doc.DocumentCategory.replaceAll(" ", "_") +
        getFileExt(doc.AgendaFile) +
        "\" " +
        " >> copy-files.log"
    );
});
|
||||||
|
|
||||||
|
// meetingsFiles.forEach(file => {
|
||||||
|
// //console.log("cp " file.fileName);
|
||||||
|
// // console.log("---\n"+yaml.dump(file.mdContent.frontMatter)+"\n---");
|
||||||
|
// });
|
||||||
|
|
144
index/example_tika_upload.js
Normal file
144
index/example_tika_upload.js
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
const axios = require('axios');
|
||||||
|
const fs = require('fs');
|
||||||
|
const cheerio = require('cheerio');
|
||||||
|
|
||||||
|
// Read the list of files from the text file
|
||||||
|
const fileLines = fs.readFileSync('file_list.txt', 'utf8').split('\n');
|
||||||
|
|
||||||
|
// Filter the list to include only files with certain file extensions
|
||||||
|
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf']; // Add more extensions as needed
|
||||||
|
const filesToIndex = fileLines.filter(line => {
|
||||||
|
const extension = line.substring(line.lastIndexOf('.')).toLowerCase();
|
||||||
|
return allowedExtensions.includes(extension);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Streams the resource at `url` into `filePath`; resolves once the write
// stream finishes and rejects on a write error.
async function downloadFile(url, filePath) {
  // Open the destination first, mirroring the original behavior of
  // creating/truncating the file even if the request later fails.
  const sink = fs.createWriteStream(filePath);
  const response = await axios({
    method: 'GET',
    url,
    responseType: 'stream',
  });
  const done = new Promise((resolve, reject) => {
    sink.on('finish', resolve);
    sink.on('error', reject);
  });
  response.data.pipe(sink);
  return done;
}
|
||||||
|
|
||||||
|
// Downloads `url` to a temp file, sends it to Tika for text extraction, and
// indexes the extracted text into Solr. Errors are logged and swallowed so a
// bad document does not stop the batch.
async function extractAndIndexWithTika(url, solrUrl) {
  try {
    const tempFilePath = 'tempfile';
    console.log(`Downloading ${url}`);
    await downloadFile(url, tempFilePath);
    console.log(`Downloaded ${url}.`);

    // Read file contents
    const fileData = fs.readFileSync(tempFilePath);

    // Make request to Tika.
    // NOTE(review): Content-Type is hard-coded to PDF although the file list
    // also admits .txt/.html/.md — confirm Tika content sniffing covers those.
    const response = await axios.put(tikaUrl, fileData, {
      headers: {
        'Content-Type': 'application/pdf',
        'X-Tika-Output-Format': 'solr',
        'X-Tika-SolrUrl': solrUrl
      }
    });

    console.log('Tika response:', response.data);

    // Reduce the HTML Tika returns to normalized plain text.
    const textContent = sanitizeIndexData(extractTextFromHtml(response.data));

    // Create Solr document.
    // BUG FIX: the id previously referenced `documentUrl`, a `let` declared
    // inside the for-loop at the bottom of this file and therefore NOT in
    // scope here — building the document threw a ReferenceError. Use `url`.
    const solrDocument = {
      id: url,
      text: textContent,
      html: response.data,
      url: url,
      content_length: textContent.length,
      content_type: "application/pdf",
    };

    // Send document to Solr for indexing
    await indexDocumentInSolr(solrDocument);

    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  }
}
|
||||||
|
|
||||||
|
// Extracts the visible text of an HTML document's <body> using cheerio.
function extractTextFromHtml(html) {
  const dom = cheerio.load(html);
  return dom('body').text().trim();
}
|
||||||
|
|
||||||
|
// Posts a single document to Solr's JSON docs endpoint with an immediate
// commit; transport failures are wrapped in a descriptive Error and rethrown.
async function indexDocumentInSolr(document) {
  const endpoint = solrUrl + '/update/json/docs';
  const config = { params: { commit: true } };
  try {
    await axios.post(endpoint, document, config);
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}
|
||||||
|
|
||||||
|
// Normalizes extracted text for indexing: collapses every run of whitespace
// (spaces, tabs, newlines) to a single space and trims both ends.
// The original followed the regex with a `while` loop re-replacing spaces,
// but /\s+/g already collapses runs, so the loop never changed anything and
// has been removed.
function sanitizeIndexData(data) {
  return data.replace(/\s+/g, ' ').trim();
}
|
||||||
|
|
||||||
|
// Issues a delete-all (*:*) plus commit to Solr. Errors are logged rather
// than rethrown, so a failed wipe does not abort the caller.
async function clearSolrIndex() {
  const body = { delete: { query: '*:*' }, commit: {} };
  const config = { headers: { 'Content-Type': 'application/json' } };
  try {
    const response = await axios.post(solrUrl + '/update', body, config);
    console.log('Deleted ' + response.data.responseHeader.status + ' documents');
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}
|
||||||
|
|
||||||
|
// Example usage
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998/tika'; // URL of the Tika instance
const baseUrl = 'https://russell-county-archives.daball.me'; // base URL of the documents to download and index
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance

// BUG FIX: the original fired clearSolrIndex() and every
// extractAndIndexWithTika() call without awaiting them. All downloads then
// raced on the single shared 'tempfile' (corrupting uploads), and indexing
// could begin before the index was cleared. Run everything sequentially
// inside an async IIFE instead.
(async () => {
  // Clear the Solr index before re-indexing.
  await clearSolrIndex();

  for (let l = 0; l < filesToIndex.length; l++) {
    const line = filesToIndex[l];
    const documentUrl = baseUrl + line;
    await extractAndIndexWithTika(documentUrl, solrUrl);
  }
})();
|
22431
index/file_list.txt
Normal file
22431
index/file_list.txt
Normal file
File diff suppressed because it is too large
Load Diff
288
index/gulpfile backup copy.js
Normal file
288
index/gulpfile backup copy.js
Normal file
|
@ -0,0 +1,288 @@
|
||||||
|
const gulp = require('gulp');
|
||||||
|
//const shell = require('gulp-shell');
|
||||||
|
const axios = require('axios');
|
||||||
|
const fs = require('fs');
|
||||||
|
const os = require ("os");
|
||||||
|
const path = require('path');
|
||||||
|
const crypto = require('crypto');
|
||||||
|
const cheerio = require('cheerio');
|
||||||
|
const SftpClient = require('ssh2-sftp-client');
|
||||||
|
const { TikaClient } = require('tika-js');
|
||||||
|
|
||||||
|
const baseUrl = 'https://russell-county-archives.daball.me'; // URL of the document to download and index
|
||||||
|
const sftpBasePath = 'david@caddy.services.cleveland.daball.me:/srv/www/russell-county-archives.daball.me/archives'; // SSH path
|
||||||
|
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // URL of the Tika instance
|
||||||
|
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance
|
||||||
|
|
||||||
|
// Read the list of files from the text file
|
||||||
|
const fileLines = fs.readFileSync('file_list.txt', 'utf8').split('\n');
|
||||||
|
|
||||||
|
// Filter the list to include only files with certain file extensions
|
||||||
|
const allowedExtensions = ['.txt', '.html', '.htm', '.md', '.pdf'];
|
||||||
|
const filesToIndex = fileLines.filter(line => {
|
||||||
|
const extension = line.substring(line.lastIndexOf('.')).toLowerCase();
|
||||||
|
return allowedExtensions.includes(extension);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Maps a file name's extension to the MIME type sent to Tika.
// Unknown extensions fall back to text/plain, like the original default case.
function extToMime(file_name) {
  const mimeByExt = new Map([
    ['.htm', 'text/html'],
    ['.html', 'text/html'],
    ['.pdf', 'application/pdf'],
    ['.md', 'text/plain'],
    ['.txt', 'text/plain'],
  ]);
  return mimeByExt.get(path.extname(file_name)) ?? 'text/plain';
}
|
||||||
|
|
||||||
|
// Reads an SSH private key from disk, returning the trimmed key text, or
// null if the file cannot be read (the error is logged, not thrown).
const readSshRsaKey = (keyFilePath) => {
  try {
    const key = fs.readFileSync(keyFilePath, 'utf8');
    return key.trim();
  } catch (error) {
    console.error('Error reading SSH RSA key:', error);
    return null;
  }
};
|
||||||
|
|
||||||
|
// Example usage
|
||||||
|
const defaultKeyFilePath = path.join(os.homedir(), ".ssh", "id_rsa");
|
||||||
|
const sshRsaKey = readSshRsaKey(defaultKeyFilePath);
|
||||||
|
|
||||||
|
// const downloadFileWithRsync = (sourceUrl, destinationPath) => {
|
||||||
|
// return new Promise((resolve, reject) => {
|
||||||
|
// const rsyncCommand = `rsync -av --progress "${sourceUrl}" "${destinationPath}"`;
|
||||||
|
// exec(rsyncCommand, (error, stdout, stderr) => {
|
||||||
|
// if (error) {
|
||||||
|
// console.error('Error downloading file with rsync:', stderr);
|
||||||
|
// reject(error);
|
||||||
|
// } else {
|
||||||
|
// console.log('File downloaded successfully:', stdout);
|
||||||
|
// resolve();
|
||||||
|
// }
|
||||||
|
// });
|
||||||
|
// });
|
||||||
|
// };
|
||||||
|
|
||||||
|
// Splits an "user@host:/path" SFTP locator into its parts; throws on any
// other shape.
const parseSftpUrl = (sftpUrl) => {
  const match = sftpUrl.match(/^(?<username>[^@]+)@(?<host>[^:]+):(?<path>.+)$/);
  if (!match) {
    throw new Error('Invalid SFTP URL format');
  }
  const { username, host, path } = match.groups;
  return { username, host, path };
};
|
||||||
|
|
||||||
|
// Downloads `remotePath` ("user@host:/path") to `localPath` over SFTP,
// authenticating with the module-level RSA key. Errors are logged and
// swallowed; the connection is always closed.
const downloadFileWithSftp = async (remotePath, localPath, options = {}) => {
  const sftp = new SftpClient();
  try {
    // Connect to the SFTP server
    const sftpUrlParts = parseSftpUrl(remotePath);
    await sftp.connect({
      host: sftpUrlParts.host,
      username: sftpUrlParts.username,
      privateKey: sshRsaKey,
    });

    // Download the file.
    // BUG FIX: the original did not await sftp.get() and then awaited
    // sftp.end() in `finally`, tearing down the connection while the
    // transfer could still be in flight. Await the transfer first.
    const writer = fs.createWriteStream(localPath, { start: 0 });
    const written = new Promise((resolve, reject) => {
      writer.on('finish', resolve);
      writer.on('error', reject);
    });
    await sftp.get(sftpUrlParts.path, writer);
    return written;
  } catch (error) {
    console.error('Error downloading file:', error);
  } finally {
    // Disconnect from the SFTP server
    await sftp.end();
  }
};
|
||||||
|
|
||||||
|
// Streams `url` to `filePath` over HTTP and resolves once the file is fully
// written. (The SFTP fast path this once contained was deliberately disabled
// and has been removed.)
async function downloadFile(url, filePath) {
  console.log(`Downloading: ${url}`);
  const writer = fs.createWriteStream(filePath, { start: 0 });

  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream'
  });

  response.data.pipe(writer);

  // BUG FIX: the original logged "Download complete" immediately after
  // starting the pipe; only log it once the write stream actually finishes.
  return new Promise((resolve, reject) => {
    writer.on('finish', () => {
      console.log(`Download complete: ${url} => ${filePath}`);
      resolve();
    });
    writer.on('error', reject);
  });
}
|
||||||
|
|
||||||
|
// Intended to fetch the sha256 checksum Solr has stored for `url`, so
// unchanged files can be skipped. Never implemented and never called —
// extractAndIndexWithTika() performs the checksum query inline instead.
async function getSolrIndexedFileChecksum(url) {

}
|
||||||
|
|
||||||
|
// Downloads `url`, skips it when Solr already holds an identical sha256
// checksum, otherwise extracts plain text via the tika-js client and indexes
// it into Solr. Errors are logged and swallowed so the batch continues.
async function extractAndIndexWithTika(url, solrUrl) {
  try {
    const tempFilePath = 'tempfile';
    await downloadFile(url, tempFilePath);
    const fileContent = fs.readFileSync(tempFilePath);
    const checksum = crypto.createHash('sha256').update(fileContent).digest('hex');

    // Query Solr to check if the file is already indexed
    const solrChecksumResponse = await axios.get(`${solrUrl}/select?q=id:"${url}"&fl=sha256sum`);
    const solrChecksum = solrChecksumResponse.data.response.docs[0]?.sha256sum;

    if (solrChecksum && solrChecksum === checksum) {
      // BUG FIX: this log referenced an undefined `filePath`, throwing a
      // ReferenceError exactly when a file should have been skipped.
      console.log(`File ${url} hasn't changed. Skipping.`);
      return;
    }

    // Make request to Tika via the tika-js client (the dead commented-out raw
    // axios upload this replaced has been removed).
    console.log(`Processing ${url}`);
    const client = new TikaClient({ host: tikaUrl });
    const version = await client.getVersion();
    console.info(`version: ${version}`);
    console.info(extToMime(url), await client.getContent(fs.createReadStream(tempFilePath), extToMime(url), path.basename(url)));
    await client.pipe(fs.createReadStream(tempFilePath), fs.createWriteStream('output.txt'), 'text/plain', tempFilePath);

    const fileData = fs.readFileSync('output.txt');

    // Normalize the extracted markup down to indexable plain text.
    const textContent = sanitizeIndexData(extractTextFromHtml(fileData));

    // Create Solr document
    const solrDocument = {
      id: url,
      text: textContent,
      sha256sum: checksum, // lets the next run skip unchanged files
      url: url,
      content_type: extToMime(url),
    };

    // Send document to Solr for indexing
    console.log(`Indexing ${url}`);
    await indexDocumentInSolr(solrDocument);

    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  }
}
|
||||||
|
|
||||||
|
// Extracts the visible text of an HTML document's <body> using cheerio.
function extractTextFromHtml(html) {
  const dom = cheerio.load(html);
  return dom('body').text().trim();
}
|
||||||
|
|
||||||
|
// Posts a single document to Solr's JSON docs endpoint with an immediate
// commit; transport failures are wrapped in a descriptive Error and rethrown.
async function indexDocumentInSolr(document) {
  const endpoint = solrUrl + '/update/json/docs';
  const config = { params: { commit: true } };
  try {
    await axios.post(endpoint, document, config);
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}
|
||||||
|
|
||||||
|
// Normalizes extracted text for indexing: collapses every run of whitespace
// (spaces, tabs, newlines) to a single space and trims both ends.
// The original followed the regex with a `while` loop re-replacing spaces,
// but /\s+/g already collapses runs, so the loop never changed anything and
// has been removed.
function sanitizeIndexData(data) {
  return data.replace(/\s+/g, ' ').trim();
}
|
||||||
|
|
||||||
|
// Issues a delete-all (*:*) plus commit to Solr. Errors are logged rather
// than rethrown, so a failed wipe does not abort the caller.
async function clearSolrIndex() {
  const body = { delete: { query: '*:*' }, commit: {} };
  const config = { headers: { 'Content-Type': 'application/json' } };
  try {
    const response = await axios.post(solrUrl + '/update', body, config);
    console.log('Deleted ' + response.data.responseHeader.status + ' documents');
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}
|
||||||
|
|
||||||
|
// Clears SOLR search index
const index_clear = async function() {
    await clearSolrIndex();
};
index_clear.displayName = 'index:clear'; // name shown by `gulp --tasks`
gulp.task(index_clear);

// Reindexes SOLR search index with plaintext results from Tika.
// Documents are processed one at a time: the sequential awaits keep the
// shared 'tempfile' safe from concurrent writes.
const index_index = async function() {
    for (let l = 0; l < filesToIndex.length; l++) {
        let line = filesToIndex[l];
        let documentUrl = baseUrl + line;
        await extractAndIndexWithTika(documentUrl, solrUrl);
    }
};
index_index.displayName = 'index:index';
gulp.task(index_index);

// Define index tasks
gulp.task('index:reindex', gulp.series('index:clear', 'index:index'));
gulp.task('index', gulp.series('index:index'));
// Define a default task (optional)
gulp.task('default', gulp.series('index'));
|
194
index/gulpfile.js
Normal file
194
index/gulpfile.js
Normal file
|
@ -0,0 +1,194 @@
|
||||||
|
const gulp = require('gulp');
|
||||||
|
const gulpif = require('gulp-if');
|
||||||
|
const through2 = require('through2');
|
||||||
|
//const shell = require('gulp-shell');
|
||||||
|
const axios = require('axios');
|
||||||
|
const fs = require('fs');
|
||||||
|
const os = require ("os");
|
||||||
|
const path = require('path');
|
||||||
|
const crypto = require('crypto');
|
||||||
|
const cheerio = require('cheerio');
|
||||||
|
const { TikaClient } = require('tika-js');
|
||||||
|
|
||||||
|
const baseUrl = 'https://no-moss-3-carbo-landfill-library.online'; // URL of the document to download and index
|
||||||
|
const tikaUrl = 'http://solr.services.cleveland.daball.me:9998'; // URL of the Tika instance
|
||||||
|
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance
|
||||||
|
|
||||||
|
// Maps a file name's extension to the MIME type sent to Tika.
// Unknown extensions fall back to text/plain, like the original default case.
function extToMime(file_name) {
  const mimeByExt = new Map([
    ['.htm', 'text/html'],
    ['.html', 'text/html'],
    ['.pdf', 'application/pdf'],
    ['.md', 'text/plain'],
    ['.txt', 'text/plain'],
  ]);
  return mimeByExt.get(path.extname(file_name)) ?? 'text/plain';
}
|
||||||
|
|
||||||
|
// Returns a predicate (for gulp-if) that matches vinyl files whose extname
// equals `extname`.
// BUG FIX: the original returned `file_extname_matches_extname`, a
// snake_case name that was never defined (the inner function is camelCase),
// so every call to the returned value's factory threw a ReferenceError.
function isFileExt(extname) {
  const fileExtnameMatchesExtname = function(file) {
    return file.extname == extname;
  };
  return fileExtnameMatchesExtname;
}
|
||||||
|
|
||||||
|
// Pipes `file` through Tika, writing the extracted plain text to output.txt.
// BUG FIX: the original passed `tempFilePath` — an undefined name in this
// scope — as the file-name hint, throwing a ReferenceError; pass `file`.
async function tikaReadPdf(file) {
  const client = new TikaClient({ host: tikaUrl });
  const fileReader = fs.createReadStream(file);
  await client.pipe(fileReader, fs.createWriteStream('output.txt'), 'text/plain', file);
}
|
||||||
|
|
||||||
|
// Intended gulp pipeline: gather the IDA PDFs and feed them through Tika.
// NOTE(review): this function is unfinished/defunct — `src` is not defined
// (presumably should be `gulp.src`), the same glob is listed twice, the
// gulpif() call is missing its stream argument, and the stream is never
// returned. Confirm intent before wiring it into a task.
function index() {
    src([
        "public/Russell_County_IDA/**.pdf",
        "public/Russell_County_IDA/**.pdf",
        //['.txt', '.html', '.htm', '.md', '.pdf']
    ])
    .pipe(gulpif(isFileExt(".pdf"), ))
}
|
||||||
|
|
||||||
|
// Downloads `url`, skips it when Solr already holds an identical sha256
// checksum, otherwise extracts plain text via the tika-js client and indexes
// it into Solr. Errors are logged and swallowed so the batch continues.
// NOTE(review): `downloadFile` is not defined in this file (it exists in the
// backup copy) — confirm it is restored before running.
async function extractAndIndexWithTika(url, solrUrl) {
  try {
    const tempFilePath = 'tempfile';
    await downloadFile(url, tempFilePath);
    const fileContent = fs.readFileSync(tempFilePath);
    const checksum = crypto.createHash('sha256').update(fileContent).digest('hex');

    // Query Solr to check if the file is already indexed
    const solrChecksumResponse = await axios.get(`${solrUrl}/select?q=id:"${url}"&fl=sha256sum`);
    const solrChecksum = solrChecksumResponse.data.response.docs[0]?.sha256sum;

    if (solrChecksum && solrChecksum === checksum) {
      // BUG FIX: this log referenced an undefined `filePath`, throwing a
      // ReferenceError exactly when a file should have been skipped.
      console.log(`File ${url} hasn't changed. Skipping.`);
      return;
    }

    // Make request to Tika via the tika-js client (the dead commented-out raw
    // axios upload this replaced has been removed).
    console.log(`Processing ${url}`);
    const client = new TikaClient({ host: tikaUrl });
    const version = await client.getVersion();
    console.info(`version: ${version}`);
    console.info(extToMime(url), await client.getContent(fs.createReadStream(tempFilePath), extToMime(url), path.basename(url)));
    await client.pipe(fs.createReadStream(tempFilePath), fs.createWriteStream('output.txt'), 'text/plain', tempFilePath);

    const fileData = fs.readFileSync('output.txt');

    // Normalize the extracted markup down to indexable plain text.
    const textContent = sanitizeIndexData(extractTextFromHtml(fileData));

    // Create Solr document
    const solrDocument = {
      id: url,
      text: textContent,
      sha256sum: checksum, // lets the next run skip unchanged files
      url: url,
      content_type: extToMime(url),
    };

    // Send document to Solr for indexing
    console.log(`Indexing ${url}`);
    await indexDocumentInSolr(solrDocument);

    console.log('Document indexed successfully:', solrDocument.id);
  } catch (error) {
    console.error('Error extracting text with Tika:', error.message);
  }
}
|
||||||
|
|
||||||
|
// Extracts the visible text of an HTML document's <body> using cheerio.
function extractTextFromHtml(html) {
  const dom = cheerio.load(html);
  return dom('body').text().trim();
}
|
||||||
|
|
||||||
|
// Posts a single document to Solr's JSON docs endpoint with an immediate
// commit; transport failures are wrapped in a descriptive Error and rethrown.
async function indexDocumentInSolr(document) {
  const endpoint = solrUrl + '/update/json/docs';
  const config = { params: { commit: true } };
  try {
    await axios.post(endpoint, document, config);
  } catch (error) {
    throw new Error('Error indexing document in Solr: ' + error.message);
  }
}
|
||||||
|
|
||||||
|
// Normalizes extracted text for indexing: collapses every run of whitespace
// (spaces, tabs, newlines) to a single space and trims both ends.
// The original followed the regex with a `while` loop re-replacing spaces,
// but /\s+/g already collapses runs, so the loop never changed anything and
// has been removed.
function sanitizeIndexData(data) {
  return data.replace(/\s+/g, ' ').trim();
}
|
||||||
|
|
||||||
|
// Issues a delete-all (*:*) plus commit to Solr. Errors are logged rather
// than rethrown, so a failed wipe does not abort the caller.
async function clearSolrIndex() {
  const body = { delete: { query: '*:*' }, commit: {} };
  const config = { headers: { 'Content-Type': 'application/json' } };
  try {
    const response = await axios.post(solrUrl + '/update', body, config);
    console.log('Deleted ' + response.data.responseHeader.status + ' documents');
  } catch (error) {
    console.error('Error clearing Solr index:', error.message);
  }
}
|
||||||
|
|
||||||
|
// Clears SOLR search index
const index_clear = async function() {
    await clearSolrIndex();
};
index_clear.displayName = 'index:clear'; // name shown by `gulp --tasks`
gulp.task(index_clear);

// Reindexes SOLR search index with plaintext results from Tika.
// NOTE(review): `filesToIndex` (and `downloadFile`) are not defined anywhere
// in this file — the file_list.txt loading present in "gulpfile backup
// copy.js" was dropped — so this task throws a ReferenceError when run.
// Confirm and restore the loader before using.
const index_index = async function() {
    for (let l = 0; l < filesToIndex.length; l++) {
        let line = filesToIndex[l];
        let documentUrl = baseUrl + line;
        await extractAndIndexWithTika(documentUrl, solrUrl);
    }
};
index_index.displayName = 'index:index';
gulp.task(index_index);

// Define index tasks
gulp.task('index:reindex', gulp.series('index:clear', 'index:index'));
gulp.task('index', gulp.series('index:index'));
// Define a default task (optional)
gulp.task('default', gulp.series('index'));
|
0
index/output.txt
Normal file
0
index/output.txt
Normal file
5128
index/package-lock.json
generated
Normal file
5128
index/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
29
index/package.json
Normal file
29
index/package.json
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
{
|
||||||
|
"name": "example_tika_solr_index",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"main": "example_tika_upload.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1",
|
||||||
|
"index": "gulp index",
|
||||||
|
"reindex": "gulp index:reindex"
|
||||||
|
},
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"axios": "^1.6.7",
|
||||||
|
"cheerio": "^1.0.0-rc.12",
|
||||||
|
"ejs": "^3.1.9",
|
||||||
|
"express": "^4.18.3",
|
||||||
|
"gulp-if": "^3.0.0",
|
||||||
|
"js-yaml": "^4.1.0",
|
||||||
|
"ssh2-sftp-client": "^10.0.3",
|
||||||
|
"through2": "^4.0.2",
|
||||||
|
"tika-js": "^1.0.2",
|
||||||
|
"tslib": "^2.6.2",
|
||||||
|
"xml2js": "^0.6.2"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"gulp": "^4.0.2"
|
||||||
|
}
|
||||||
|
}
|
81
index/search_solr.js
Normal file
81
index/search_solr.js
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
// Minimal Express front-end serving full-text search against the Solr index.
const express = require('express');
const axios = require('axios');
const app = express();
const path = require('path');

// Set EJS as the view engine
app.set('view engine', 'ejs');

// Specify the views directory
app.set('views', path.join(__dirname, 'views'));

// Middleware to parse JSON request body
app.use(express.json());

// Serve static files (CSS, JavaScript, images, etc.)
app.use(express.static('public'));
|
||||||
|
|
||||||
|
// Search endpoint
|
||||||
|
app.get('/search', async (req, res) => {
|
||||||
|
try {
|
||||||
|
// Extract search query from request query parameters
|
||||||
|
const { query, page = 1, pageSize = 10 } = req.query;
|
||||||
|
|
||||||
|
// Validate search query
|
||||||
|
if (!query) {
|
||||||
|
return res.status(400).json({ error: 'Query parameter is required' });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate start offset for pagination
|
||||||
|
const start = (page - 1) * pageSize;
|
||||||
|
|
||||||
|
// Sanitize search query to prevent code injection
|
||||||
|
const sanitizedQuery = sanitizeQuery(query);
|
||||||
|
|
||||||
|
// Send search query to Solr
|
||||||
|
const response = await axios.get(solrUrl + '/select', {
|
||||||
|
params: {
|
||||||
|
q: `text:${sanitizedQuery}`, // Query string with field name
|
||||||
|
hl: 'true',
|
||||||
|
'hl.method': 'unified',
|
||||||
|
'hl.fl': '*',
|
||||||
|
'hl.snippets': 5,
|
||||||
|
'hl.tag.pre': '<b class=\"result-highlight\">',
|
||||||
|
'hl.tag.post': '</b>',
|
||||||
|
start, // Start offset for pagination
|
||||||
|
rows: 10, // Number of rows to return
|
||||||
|
wt: 'json', // Response format (JSON)
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Extract search results from Solr response
|
||||||
|
const searchResults = response.data.response.docs;
|
||||||
|
const highlightedSnippets = response.data.highlighting;
|
||||||
|
|
||||||
|
// Calculate total number of results (needed for pagination)
|
||||||
|
const totalResults = response.data.response.numFound;
|
||||||
|
|
||||||
|
// Calculate total number of pages
|
||||||
|
const totalPages = Math.ceil(totalResults / pageSize);
|
||||||
|
|
||||||
|
// Send search results as JSON response
|
||||||
|
//res.json({ searchResults, highlightedSnippets });
|
||||||
|
res.render('search-results', { query, searchResults, highlightedSnippets, page, pageSize, totalResults, totalPages });
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error searching Solr:', error.message);
|
||||||
|
res.status(500).json({ error: 'Internal server error' });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Function to sanitize search query to prevent code injection
|
||||||
|
function sanitizeQuery(query) {
|
||||||
|
// Remove any characters that are not alphanumeric or whitespace
|
||||||
|
return query.replace(/[^\w\s"]/gi, '');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start server
|
||||||
|
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance
|
||||||
|
const PORT = process.env.PORT || 3000;
|
||||||
|
app.listen(PORT, () => {
|
||||||
|
console.log(`Server is running on port ${PORT}`);
|
||||||
|
});
|
88
index/views/search-results.ejs
Normal file
88
index/views/search-results.ejs
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Search Results</title>
  <!-- Bootstrap CSS plus site fonts/icons/theme -->
  <link href="https://daball.me/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
  <link href="https://fonts.googleapis.com/css?family=Saira+Extra+Condensed:100,200,300,400,500,600,700,800,900" rel="stylesheet">
  <link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i,800,800i" rel="stylesheet">
  <link href="https://daball.me/vendor/font-awesome/css/font-awesome.min.css" rel="stylesheet">
  <link href="https://daball.me/vendor/devicons/css/devicons.min.css" rel="stylesheet">
  <link href="https://daball.me/vendor/devicon/devicon.min.css" rel="stylesheet">
  <link href="https://daball.me/vendor/simple-line-icons/css/simple-line-icons.css" rel="stylesheet">
  <link href="https://daball.me/layouts/blog/css/blog.min.css" rel="stylesheet">
  <style>
    .result-highlight { background-color: #FBF719; font-weight: normal; }
  </style>
</head>
<body>
  <div class="navbar navbar-expand-lg fixed-top navbar-dark bg-primary">
    <div class="container">
      <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation"><span class="navbar-toggler-icon"></span></button>
      <div class="collapse navbar-collapse" id="navbarResponsive">
        <a class="navbar-brand" href="/">David A. Ball</a>
        <ul class="navbar-nav">
          <li class="nav-item"><a class="nav-link" href="https://russell-county-archives.daball.me/">Russell County Archives</a></li>
        </ul>
        <!-- Search form. BUG FIX: value was the truncated attribute `value="<s"`;
             it now echoes the current query, HTML-escaped by <%= %>. -->
        <form class="d-flex ms-auto">
          <input class="form-control me-2" type="search" placeholder="Search" aria-label="Search" value="<%= query %>" name="query">
          <button class="btn btn-outline-success" type="submit">Search</button>
        </form>
      </div>
    </div>
  </div>
  <div class="container">
    <h1 class="mt-5">Search Results</h1>
    <div id="searchResults" class="mt-3">
      <!-- Search results rendered server-side from Solr docs -->
      <ul class="list-group">
        <% searchResults.forEach(result => { %>
          <li class="list-group-item">
            <h5><%= result.title %></h5>
            <% if (highlightedSnippets[result.id] && highlightedSnippets[result.id].text) { %>
              <%# <%- %> is intentional: snippets contain the <b> highlight tags %>
              <% highlightedSnippets[result.id].text.forEach(snippet => { %>
                <p><%- snippet %></p>
              <% }); %>
            <% } else { %>
              <p>No snippet available</p>
            <% } %>
            <a href="<%= result.url %>"><%= result.url %></a>
          </li>
        <% }); %>
      </ul>
    </div>
    <!-- Pagination controls. BUG FIX: the container was closed twice (a stray
         </div> followed </nav>); the nav now sits inside .container and the
         duplicate close is removed. -->
    <nav aria-label="Search results pagination">
      <ul class="pagination justify-content-center mt-4">
        <% if (page > 1) { %>
          <li class="page-item">
            <a class="page-link" href="/search?query=<%= query %>&page=<%= page - 1 %>&pageSize=<%= pageSize %>">Previous</a>
          </li>
        <% } %>
        <% for (let i = 1; i <= totalPages; i++) { %>
          <li class="page-item <%= i === page ? 'active' : '' %>">
            <a class="page-link" href="/search?query=<%= query %>&page=<%= i %>&pageSize=<%= pageSize %>"><%= i %></a>
          </li>
        <% } %>
        <% if (page < totalPages) { %>
          <li class="page-item">
            <a class="page-link" href="/search?query=<%= query %>&page=<%= parseInt(page, 10) + 1 %>&pageSize=<%= pageSize %>">Next</a>
          </li>
        <% } %>
      </ul>
    </nav>
  </div>
  <!-- Bootstrap JS and site scripts -->
  <script src="https://daball.me/vendor/jquery/jquery.min.js"></script>
  <script src="https://daball.me/vendor/popper.js/dist/popper.min.js"></script>
  <script src="https://daball.me/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
  <script src="https://daball.me/vendor/jquery-easing/jquery.easing.min.js"></script>
  <script src="https://daball.me/layouts/blog/js/blog.min.js"></script>
</body>
</html>
|
Loading…
Reference in New Issue
Block a user