Switched to Lucene parser and Solr client for more robust search.

This commit is contained in:
David Ball 2024-03-14 03:12:53 -04:00
parent dbcd3ba2fc
commit e051222fb1
9 changed files with 230 additions and 212 deletions

View File

@ -1,83 +0,0 @@
const express = require('express');
const axios = require('axios');
const app = express();
const path = require('path');
// Application-wide configuration. The statements below run once at start-up,
// before any route is registered; middleware is applied in registration order.
// Set EJS as the view engine
app.set('view engine', 'ejs');
// Specify the views directory (built from __dirname, so it does not depend on
// the directory the process was started from)
app.set('views', path.join(__dirname, 'views'));
// Middleware to parse JSON request body
app.use(express.json());
// Serve static files (CSS, JavaScript, images, etc.)
// NOTE(review): 'public' is a relative path; express.static resolves it against
// the process working directory, unlike the views path above -- confirm the
// server is always launched from the project root.
app.use(express.static('public'));
// Search endpoint: GET /search?q=<terms>&page=<n>&pageSize=<n>
//
// Looks the sanitized terms up in the Solr `text` field and renders the
// `search-results` view with the matching documents, their highlighted
// snippets and the paging figures.
//   - 400 + JSON error when `q` is missing, is not a single string, or is
//     empty once sanitized.
//   - 500 + JSON error when the Solr request (or anything else) fails.
app.get('/search', async (req, res) => {
  try {
    // Extract search query from request query parameters
    const query = req.query.q;

    // A repeated parameter (?q=a&q=b) arrives as an array and a missing one as
    // undefined; neither is a usable search string.
    if (typeof query !== 'string') {
      return res.status(400).json({ error: 'q parameter is required' });
    }

    // Sanitize search query to prevent code injection
    const sanitizedQuery = sanitizeQuery(query);
    if (sanitizedQuery.trim() === '') {
      // Nothing searchable is left; `text:` alone would be a Solr syntax error.
      return res.status(400).json({ error: 'q parameter is required' });
    }

    // Query-string values are strings. Parse them once and fall back to the
    // defaults for anything that is not a positive integer, so the offset sent
    // to Solr can never be NaN or negative.
    const requestedPage = Number.parseInt(req.query.page, 10);
    const requestedPageSize = Number.parseInt(req.query.pageSize, 10);
    const page = requestedPage >= 1 ? requestedPage : 1;
    // Capped so a client cannot request an arbitrarily large result set.
    const pageSize = requestedPageSize >= 1 ? Math.min(requestedPageSize, 100) : 10;

    // Calculate start offset for pagination
    const start = (page - 1) * pageSize;

    // Send search query to Solr
    const response = await axios.get(solrUrl + '/select', {
      params: {
        q: `text:${sanitizedQuery}`, // Query string with field name
        hl: 'true',
        'hl.method': 'unified',
        'hl.fl': '*',
        'hl.snippets': 5,
        'hl.tag.pre': '<b class=\"result-highlight\">',
        'hl.tag.post': '</b>',
        'hl.usePhraseHighlighter': true,
        start, // Start offset for pagination
        // Must match the page size used for `start` and `totalPages`;
        // otherwise consecutive pages skip or repeat documents.
        rows: pageSize,
        wt: 'json', // Response format (JSON)
      },
    });

    // Extract search results from Solr response
    const searchResults = response.data.response.docs;
    const highlightedSnippets = response.data.highlighting || {};
    // Total number of results and pages (needed for pagination)
    const totalResults = response.data.response.numFound;
    const totalPages = Math.ceil(totalResults / pageSize);

    res.render('search-results', { query, searchResults, highlightedSnippets, page, pageSize, totalResults, totalPages });
  } catch (error) {
    console.error('Error searching Solr:', error.message);
    res.status(500).json({ error: 'Internal server error' });
  }
});
/**
 * Sanitizes a user-supplied search string before it is placed in a Solr query.
 *
 * Keeps word characters (letters, digits, underscore), whitespace and double
 * quotes; every other character is removed.
 *
 * @param {*} query - Raw search text. Non-string values are converted with
 *   String(); null and undefined are treated as an empty query.
 * @returns {string} The query with all other characters stripped.
 */
function sanitizeQuery(query) {
  // A missing query parameter must not crash the caller with a TypeError.
  if (query === null || query === undefined) {
    return '';
  }
  return String(query).replace(/[^\w\s"]/gi, '');
}
// Solr core queried by the /search endpoint.
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core';
// Listening port: the PORT environment variable when set, otherwise 3000.
const PORT = process.env.PORT || 3000;

// Logs the port; passed to app.listen as its callback.
function logServerStarted() {
  console.log(`Server is running on port ${PORT}`);
}

// Start server
app.listen(PORT, logServerStarted);

View File

@ -7,13 +7,12 @@ const glob = require('glob');
const matter = require('gray-matter');
const ejs = require('ejs');
const helpers = require('../views/helpers/functions');
const search = require('../routes/search');
// const advancedSearch = require('../routes/advanced-search');
// Port number for HTTP server
const port = process.env.PORT||3000;
// Solr instance URL
const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core';
// Set EJS as the view engine
app.set('view engine', 'ejs');
@ -36,6 +35,10 @@ app.use(express.json());
// res.send('Hello World!');
// })
// Search endpoints
app.use('/search', search.router);
// app.use('/advanced-search', advancedSearch.router);
// Endpoints for all the site's pages.
glob.globSync('pages/**/*.md', {
cwd: path.join(__dirname, '..'),
@ -58,67 +61,6 @@ glob.globSync('pages/**/*.md', {
});
});
// Search endpoint: GET /search?q=<terms>&page=<n>&pageSize=<n>
//
// Looks the sanitized terms up in the Solr `text` field and renders the
// `search-results` view. Any problem (missing/empty query, Solr failure) is
// reported by rendering the `search-error` view instead.
app.get('/search', async (req, res) => {
  const { q: query } = req.query;

  // Sanitize search query to prevent code injection. A missing parameter is
  // undefined and a repeated one (?q=a&q=b) is an array; neither may reach
  // sanitizeQuery, which expects a string.
  const sanitizedQuery = typeof query === 'string' ? sanitizeQuery(query) : '';

  // Validate search query. Returning here matters: without it the handler
  // would go on to query Solr and try to render a second response.
  if (sanitizedQuery.trim() === '') {
    return res.render('search-error', { h: helpers, query: sanitizedQuery, error: { code: 400, message: 'Search query is required.' } });
  }

  // Query-string values are strings. Anything that is not a positive integer
  // falls back to the default so the offset can never be NaN or negative.
  const requestedPage = Number.parseInt(req.query.page, 10);
  const requestedPageSize = Number.parseInt(req.query.pageSize, 10);
  const page = requestedPage >= 1 ? requestedPage : 1;
  const pageSize = requestedPageSize >= 1 ? Math.min(requestedPageSize, 100) : 10; // cap at 100

  // Calculate start offset for pagination
  const start = (page - 1) * pageSize;

  try {
    // Send search query to Solr
    const response = await axios.get(solrUrl + '/select', {
      params: {
        q: `text:${sanitizedQuery}`, // Query string with field name
        hl: 'true',
        'hl.method': 'unified',
        'hl.fl': '*',
        'hl.snippets': 5,
        'hl.tag.pre': '<strong class=\"result-highlight\">',
        'hl.tag.post': '</strong>',
        'hl.usePhraseHighlighter': true,
        start, // Start offset for pagination
        // Must match the page size used for `start` and `totalPages`;
        // otherwise consecutive pages skip or repeat documents.
        rows: pageSize,
        wt: 'json', // Response format (JSON)
      },
    });

    // Extract search results from Solr response
    const searchResults = response.data.response.docs;
    const highlightedSnippets = response.data.highlighting || {};
    // Total number of results and pages (needed for pagination)
    const totalResults = response.data.response.numFound;
    const totalPages = Math.ceil(totalResults / pageSize);

    res.render('search-results', { h: helpers, query: sanitizedQuery, searchResults, highlightedSnippets, page, pageSize, totalResults, totalPages });
  } catch (error) {
    res.render('search-error', { h: helpers, query: sanitizedQuery, error });
  }
});
/**
 * Sanitizes a user-supplied search string before it is placed in a Solr query.
 *
 * Keeps word characters (letters, digits, underscore), whitespace and the
 * characters * , . " ; every other character is removed.
 *
 * @param {*} query - Raw search text. Non-string values are converted with
 *   String(); null and undefined are treated as an empty query.
 * @returns {string} The query with all other characters stripped.
 */
function sanitizeQuery(query) {
  // A missing query parameter must not crash the caller with a TypeError.
  if (query === null || query === undefined) {
    return '';
  }
  return String(query).replace(/[^\w\s*,."]/gi, '');
}
//app.get('/OCR-Encoded-PDFs/Russell-County-Web-Site_2024-02-13_19_50_Modified-With-OCR-Encoding**', rewriter.rewrite('/Web_Site_Archives/Russell_County_Web_Site-2024-02-13_19_50_Modified_With_OCR_Encoding/$1'));
app.get('*', async (req, res) => {

View File

@ -34,17 +34,17 @@ const directoryTemplate = (vals) => {
});
});
};
// const errorTemplate = (vals) => {
// return new Promise((resolve, reject) => {
// ejs.renderFile("views/error.ejs", { h: helpers, ...vals }, (err, str) => {
// if (err) {
// reject(err);
// } else {
// resolve(str);
// }
// });
// });
// };
/**
 * Renders the views/error.ejs template to a string.
 *
 * @param {object} vals - Template locals. They are merged over `{ h: helpers }`,
 *   so an `h` key in `vals` replaces the shared helpers.
 * @returns {Promise<string>} Resolves with the rendered markup; rejects with
 *   the error reported by ejs.renderFile.
 */
const errorTemplate = (vals) =>
  new Promise((fulfil, fail) => {
    const locals = { h: helpers, ...vals };
    const onRendered = (renderError, markup) => {
      if (renderError) {
        fail(renderError);
        return;
      }
      fulfil(markup);
    };
    ejs.renderFile("views/error.ejs", locals, onRendered);
  });
const etags = new Map();

View File

@ -2,8 +2,9 @@
"name": "express",
"version": "1.0.0",
"description": "",
"main": "index.js",
"main": "app/server.js",
"scripts": {
"server": "node app/server.js",
"test": "echo \"Error: no test specified\" && exit 1",
"transpile:ts": "tsc -project tsconfig.build.json",
"index": "gulp index",
@ -26,6 +27,7 @@
"gulp": "^4.0.2",
"gulp-if": "^3.0.0",
"js-yaml": "^4.1.0",
"lucene": "^2.1.1",
"markdown-it": "^14.0.0",
"markdown-it-attrs": "^4.1.6",
"mime-types": "2.1.18",
@ -39,6 +41,7 @@
"path-to-regexp": "2.2.1",
"range-parser": "1.2.0",
"request-promise-native": "^1.0.9",
"solr-client": "^0.10.0-rc10",
"ssh2-sftp-client": "^10.0.3",
"superagent": "^8.1.2",
"through2": "^4.0.2",

146
routes/search.js Normal file
View File

@ -0,0 +1,146 @@
const express = require('express');
const router = express.Router();
const { parse, toString } = require('lucene');
const { createClient, Query } = require('solr-client');
const solrConfig = { host: 'solr.services.cleveland.daball.me', port: 8983, core: 'my_core' };
const helpers = require('../views/helpers/functions');
// Paging defaults and limits for the search endpoint.
const DEFAULT_PAGE_SIZE = 10;
const MAX_PAGE_SIZE = 100;

// Markup wrapped around matched terms in the highlighted snippets.
const HIGHLIGHT_PRE = `<b class="result-highlight">`;
const HIGHLIGHT_POST = `</b>`;

// Shape of the message solr-client puts on the Error it throws for a failed
// HTTP request: "Request HTTP error <status>: <JSON body>".
const SOLR_HTTP_ERROR_PATTERN = /^Request HTTP error (?<statusCode>\d{1,3}): (?<text>\{.*\}$)/s;

// One client is enough for every request; it only carries connection settings.
const solrClient = createClient(solrConfig);

/**
 * Reads a query-string value as a positive integer.
 *
 * @param {*} value - Raw value from req.query (normally a string or undefined).
 * @param {number} fallback - Returned when `value` is not an integer >= 1.
 * @returns {number} The parsed integer or `fallback`.
 */
function toPositiveInt(value, fallback) {
  const parsed = Number.parseInt(value, 10);
  return Number.isInteger(parsed) && parsed >= 1 ? parsed : fallback;
}

/**
 * Returns a copy of Solr's highlighting map in which the <em> tags of every
 * `text` snippet are replaced by the site's highlight markup. Works around the
 * simplePre/simplePost options not being honoured. The input is not modified.
 *
 * @param {object} [highlighting] - `highlighting` section of a Solr response.
 * @returns {object} New map with the same keys; empty when the input is absent.
 */
function restyleHighlights(highlighting) {
  const restyled = {};
  Object.entries(highlighting || {}).forEach(([docId, fields]) => {
    if (fields && Array.isArray(fields.text) && fields.text.length > 0) {
      const text = fields.text.map((snippet) => (
        snippet.replaceAll("<em>", HIGHLIGHT_PRE).replaceAll("</em>", HIGHLIGHT_POST)
      ));
      restyled[docId] = { ...fields, text };
    } else {
      restyled[docId] = fields;
    }
  });
  return restyled;
}

/**
 * Converts whatever solrClient.search rejected with into the plain
 * `{ message, code?, innerError? }` object the search-error view displays.
 *
 * @param {*} error - The rejection reason.
 * @returns {{message: string, code?: string, innerError?: object}}
 */
function describeSearchError(error) {
  if (!(error instanceof Error)) {
    return { message: String(error) };
  }
  const match = error.message.match(SOLR_HTTP_ERROR_PATTERN);
  if (match && match.groups) {
    try {
      return {
        message: "Solr Client Request HTTP Error",
        code: match.groups.statusCode,
        innerError: JSON.parse(match.groups.text),
      };
    } catch (parseError) {
      // The body looked like JSON but was not; fall through to the plain message.
    }
  }
  return { message: error.message };
}

/**
 * GET /search?q=<lucene query>&page=<n>&pageSize=<n>
 *
 * Parses `q` with the Lucene parser, runs it against the Solr `text` field with
 * highlighting enabled and renders `search-results`. A missing or malformed
 * query and any Solr failure render `search-error` instead.
 */
router.get('/', async (req, res) => {
  // Extract paging parameters from request query parameters. Query-string
  // values are primitive strings, so they are parsed explicitly; invalid
  // values fall back to the defaults and the page size is capped.
  const { q } = req.query;
  const page = toPositiveInt(req.query.page, 1);
  const pageSize = Math.min(toPositiveInt(req.query.pageSize, DEFAULT_PAGE_SIZE), MAX_PAGE_SIZE);
  // Calculate start offset for pagination
  const start = (page - 1) * pageSize;

  // A repeated parameter (?q=a&q=b) arrives as an array; only a non-blank
  // string is a usable query.
  if (typeof q !== 'string' || q.trim() === '') {
    res.render('search-error', { h: helpers, query: q, error: { code: 400, message: 'Search query is required.' } });
    return;
  }

  // Parse the query and serialise it back into a Solr q string. The parser
  // throws on malformed Lucene syntax, which is a user error, not a server one.
  let qQuery;
  try {
    qQuery = toString(parse(q));
  } catch (parseError) {
    res.render('search-error', { h: helpers, query: q, error: { code: 400, message: `Invalid search query: ${parseError.message}` } });
    return;
  }

  // Generate a Solr query based on the query string and additional parameters.
  // `rows` must equal the page size used for `start` and `totalPages`.
  const solrQuery = new Query().df('text').q(qQuery).start(start).rows(pageSize).hl({ options: {
    on: true,
    q: qQuery,
    fl: '*',
    snippets: 5,
    formatter: 'simple',
    simplePre: HIGHLIGHT_PRE,
    simplePost: HIGHLIGHT_POST,
    highlightMultiTerm: true,
    usePhraseHighlighter: true,
  }});

  try {
    const solrResponse = await solrClient.search(solrQuery);
    // Calculate total number of results and pages (needed for pagination)
    const totalResults = solrResponse.response.numFound;
    const totalPages = Math.ceil(totalResults / pageSize);
    res.render('search-results', {
      h: helpers,
      query: qQuery,
      page,
      pageSize,
      totalResults,
      totalPages,
      solrQuery: solrQuery,
      ...solrResponse,
      highlighting: restyleHighlights(solrResponse.highlighting),
    });
  } catch (error) {
    res.render('search-error', { h: helpers, query: qQuery, error: describeSearchError(error) });
  }
});
// Public surface of this module: only the Express router is exported.
// The commented-out entries below are names that are not defined in this
// file; presumably they are left over from an earlier inline implementation.
module.exports = {
router,
// solrUrl,
// sanitizeQuery,
};

View File

@ -113,8 +113,8 @@
<body>
<main>
<section>
<span><%= statusCode %></span>
<p><%= message %></p>
<span><% if (typeof statusCode !== 'undefined') { %><%= statusCode %><% } %></span>
<p><% if (typeof message !== 'undefined') { %><%= message %><% } %></p>
</section>
</main>
</body>

View File

@ -22,11 +22,14 @@
</p>
</div>
<div id="searchError" class="mt-3 shadow-lg p-lg-5">
<% if (typeof error != undefined) {%>
<% if (typeof error !== 'undefined') {%>
<p>An error occurred while attempting to perform a search.</p>
<% if (typeof query != undefined) {%><p><b>Search Query:</b> <span id="search-query"><%= query %></span></p><% } %>
<% if (typeof error.code != undefined) {%><p><b>Error Code:</b> <span id="error-code"><%= error.code %></span></p><% } %>
<% if (typeof error.message != undefined) {%><p><b>Error Message:</b> <span id="error-code"><%= error.message %></span></p><% } %>
<% if (typeof query !== 'undefined') {%><p><b>Search Query:</b> <span id="search-query"><%= query %></span></p><% } %>
<% if (typeof error.code !== 'undefined') {%><p><b>Error Code:</b> <span id="error-code"><%= error.code %></span></p><% } %>
<% if (typeof error.message !== 'undefined') {%><p><b>Error Message:</b> <span id="error-message"><%= error.message %></span></p><% } %>
<% if (typeof error.innerError !== 'undefined' && typeof error.innerError.error !== 'undefined') { %>
<% if (typeof error.innerError.error.msg !== 'undefined') {%><p><b>Inner Error Message:</b> <span id="inner-error-message"><%- error.innerError.error.msg.replaceAll("<", "&lt;").replaceAll(">", "&gt;").replaceAll("\n", '<br>') %></span></p><% } %>
<% } %>
<% } %>
</div>

View File

@ -15,55 +15,60 @@
<%# The query is user input, so it must be written with the escaping output tag. %>
Search Results for <%= query %>
</h1>
</header>
<div>
<p>
Disclaimer: Use of the search feature is subject to both the <a href="/search-policy">Search
Policy</a> and the <a href="/privacy-policy">Privacy Policy</a>.
</p>
</div>
<div id="searchResults" class="mt-3 shadow-lg">
<!-- Search results will be dynamically populated here -->
<% if (searchResults.length === 0) { %>
<p>No documents found matching the search query.</p>
<% } else { %>
<ul class="list-group shadow-lg">
<% searchResults.forEach(result => { %>
<li class="list-group-item list-group-item-action flex-column align-items-start">
<h5><%= result.title %></h5>
<% if (highlightedSnippets[result.id] && highlightedSnippets[result.id].text) { %>
<% highlightedSnippets[result.id].text.forEach(snippet => { %>
<p><%- snippet %></p>
<% }); %>
<% } else { %>
<p>No snippet available</p>
<% } %>
<a href="<%= result.url %>"><%= result.url %></a>
</li>
<% }); %>
</ul>
<% } %>
</div>
<div class="container">
<p>
Disclaimer: Use of the search feature is subject to both the <a href="/search-policy">Search
Policy</a> and the <a href="/privacy-policy">Privacy Policy</a>.
</p>
</div>
<% if (typeof response !== "undefined" && typeof response.numFound !== "undefined" && typeof response.docs !== "undefined" && typeof highlighting !== "undefined") { %>
<div id="searchResults" class="mt-3 shadow-lg">
<!-- Search results will be dynamically populated here -->
<% if (response.numFound == 0) { %>
<p>No documents found matching the search query.</p>
<% } else { %>
<ul class="list-group shadow-lg">
<% response.docs.forEach(doc => { %>
<li class="list-group-item list-group-item-action flex-column align-items-start">
<h5><%= doc.title %></h5>
<% if (highlighting[doc.id] && highlighting[doc.id].text) { %>
<% highlighting[doc.id].text.forEach(snippet => { %>
<p><%- snippet %></p>
<% }); %>
<% } else { %>
<!-- <p>No highlight available.</p> -->
<% } %>
<a href="<%= doc.url %>"><%= doc.url %></a>
</li>
<% }); %>
</ul>
<% } %>
</div>
</div>
<% } %>
<!-- Pagination controls -->
<%# Paging links. The query is URL-encoded so that characters such as &, # or + survive the round trip. %>
<nav aria-label="Search results pagination">
<ul class="pagination justify-content-center mt-4">
<% if (page > 1) { %>
<li class="page-item">
<a class="page-link" href="/search?q=<%= encodeURIComponent(query) %>&amp;page=<%= page - 1 %>&amp;pageSize=<%= pageSize %>">Previous</a>
</li>
<% } %>
<% for (let i = 1; i <= totalPages; i++) { %>
<%# page may still be a string taken from the query string, so compare numerically. %>
<li class="page-item <%= i === parseInt(page, 10) ? 'active' : '' %>">
<a class="page-link" href="/search?q=<%= encodeURIComponent(query) %>&amp;page=<%= i %>&amp;pageSize=<%= pageSize %>"><%= i %></a>
</li>
<% } %>
<% if (page < totalPages) { %>
<li class="page-item">
<a class="page-link" href="/search?q=<%= encodeURIComponent(query) %>&amp;page=<%= parseInt(page, 10) + 1 %>&amp;pageSize=<%= pageSize %>">Next</a>
</li>
<% } %>
</ul>
</nav>
<% if (typeof totalPages !== "undefined") { %>
<%# Paging controls: at most 15 numbered links, windowed around the current page. %>
<%
  // page may be a number or a query-string value; normalise it once.
  const currentPage = parseInt(page, 10);
  const firstLink = Math.max(Math.min(currentPage - 7, totalPages - 14), 1);
  const lastLink = Math.min(totalPages, Math.max(currentPage - 7, 1) + 14);
  // The query is URL-encoded so characters such as &, # or + survive the round
  // trip; the escaping output tag then HTML-escapes the finished URL.
  const pageHref = (n) => `/search?q=${encodeURIComponent(query)}&page=${n}&pageSize=${pageSize}`;
%>
<nav aria-label="Search results pagination">
<ul class="pagination justify-content-center mt-4">
<% if (currentPage > 1) { %>
<li class="page-item">
<a class="page-link" href="<%= pageHref(currentPage - 1) %>">Previous</a>
</li>
<% } %>
<% for (let i = firstLink; i <= lastLink; i++) { %>
<li class="page-item <%= i === currentPage ? 'active' : '' %>">
<a class="page-link" href="<%= pageHref(i) %>"><%= i %></a>
</li>
<% } %>
<% if (currentPage < totalPages) { %>
<li class="page-item">
<a class="page-link" href="<%= pageHref(currentPage + 1) %>">Next</a>
</li>
<% } %>
</ul>
</nav>
<%# With no hits the range would read "1 through 0", so the summary is shown only when there are results. %>
<% if (totalResults > 0) { %>
<p class="center">Page <%= currentPage %> out of <%= totalPages %>. Displaying results <%= (currentPage-1)*pageSize+1 %> through <%= Math.min(currentPage*pageSize, totalResults) %> out of <%= totalResults %> total results.</p>
<% } %>
<% } %>
</main>
<%- include('./includes/bottom-navbar.ejs') %>

View File

@ -18,11 +18,13 @@
<security>
<requestFiltering>
<hiddenSegments>
<add segment=".git" />
<add segment="node_modules" />
<add segment="iisnode" />
</hiddenSegments>
</requestFiltering>
</security>
<httpErrors errorMode="Detailed" />
<urlCompression doDynamicCompression="false" />
</system.webServer>
</configuration>