From e051222fb173efc0b6fcdb2c36c4759a18dd908c Mon Sep 17 00:00:00 2001 From: David Ball Date: Thu, 14 Mar 2024 03:12:53 -0400 Subject: [PATCH] Switched to Lucene parser and Solr client for more robust search. --- app/search.js | 83 ---------------------- app/server.js | 70 ++----------------- app/vercel-serve.js | 22 +++--- package.json | 5 +- routes/search.js | 146 +++++++++++++++++++++++++++++++++++++++ views/error.ejs | 4 +- views/search-error.ejs | 11 +-- views/search-results.ejs | 99 +++++++++++++------------- web.config | 2 + 9 files changed, 230 insertions(+), 212 deletions(-) delete mode 100644 app/search.js create mode 100644 routes/search.js diff --git a/app/search.js b/app/search.js deleted file mode 100644 index 9fa56a7d..00000000 --- a/app/search.js +++ /dev/null @@ -1,83 +0,0 @@ -const express = require('express'); -const axios = require('axios'); -const app = express(); -const path = require('path'); - -// Set EJS as the view engine -app.set('view engine', 'ejs'); - -// Specify the views directory -app.set('views', path.join(__dirname, 'views')); - -// Middleware to parse JSON request body -app.use(express.json()); - -// Serve static files (CSS, JavaScript, images, etc.) -app.use(express.static('public')); - -// Search endpoint -app.get('/search', async (req, res) => { - try { - // Extract search query from request query parameters - const { q, page = 1, pageSize = 10 } = req.query; - const query = q; - - // Validate search query - if (!query) { - return res.status(400).json({ error: 'q parameter is required' }); - } - - // Calculate start offset for pagination - const start = (page - 1) * pageSize; - - // Sanitize search query to prevent code injection - const sanitizedQuery = sanitizeQuery(query); - - // Send search query to Solr - const response = await axios.get(solrUrl + '/select', { - params: { - q: `text:${sanitizedQuery}`, // Query string with field name - hl: 'true', - 'hl.method': 'unified', - 'hl.fl': '*', - 'hl.snippets': 5, - 'hl.tag.pre': '', - 'hl.tag.post': '', - 'hl.usePhraseHighlighter': true, - start, // Start offset for pagination - rows: 10, // Number of rows to return - wt: 'json', // Response format (JSON) - }, - }); - - // Extract search results from Solr response - const searchResults = response.data.response.docs; - const highlightedSnippets = response.data.highlighting; - - // Calculate total number of results (needed for pagination) - const totalResults = response.data.response.numFound; - - // Calculate total number of pages - const totalPages = Math.ceil(totalResults / pageSize); - - // Send search results as JSON response - //res.json({ searchResults, highlightedSnippets }); - res.render('search-results', { query, searchResults, highlightedSnippets, page, pageSize, totalResults, totalPages }); - } catch (error) { - console.error('Error searching Solr:', error.message); - res.status(500).json({ error: 'Internal server error' }); - } -}); - -// Function to sanitize search query to prevent code injection -function sanitizeQuery(query) { - // Remove any characters that are not alphanumeric or whitespace - return query.replace(/[^\w\s"]/gi, ''); -} - -// Start server -const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; // URL of your Solr instance -const PORT = process.env.PORT || 3000; -app.listen(PORT, () => { - console.log(`Server is running on port ${PORT}`); -}); diff --git a/app/server.js b/app/server.js index 7fb675b5..62b70afd 100644 --- a/app/server.js +++ b/app/server.js @@ -7,13 +7,12 @@ const glob = require('glob'); const matter = require('gray-matter'); const ejs = require('ejs'); const helpers = require('../views/helpers/functions'); +const search = require('../routes/search'); +// const advancedSearch = require('../routes/advanced-search'); // Port number for HTTP server const port = process.env.PORT||3000; -// Solr instance URL -const solrUrl = 'http://solr.services.cleveland.daball.me:8983/solr/my_core'; - // Set EJS as the view engine app.set('view engine', 'ejs'); @@ -36,6 +35,10 @@ app.use(express.json()); // res.send('Hello World!'); // }) +// Search endpoints +app.use('/search', search.router); +// app.use('/advanced-search', advancedSearch.router); + // Endpoints for all the site's pages. glob.globSync('pages/**/*.md', { cwd: path.join(__dirname, '..'), @@ -58,67 +61,6 @@ glob.globSync('pages/**/*.md', { }); }); -// Search endpoint -app.get('/search', async (req, res) => { - // Extract search query from request query parameters - let { q, page = 1, pageSize = 10 } = req.query; - pageSize = Math.min(pageSize, 100); // cap at 100 - const query = q; - // Calculate start offset for pagination - const start = (page - 1) * pageSize; - - // Sanitize search query to prevent code injection - const sanitizedQuery = sanitizeQuery(query); - try { - // Validate search query - if (!query) { - //return res.status(400).json({ error: 'q parameter is required' }); - res.render('search-error', { h: helpers, query: sanitizedQuery, error: { code: 400, message: 'Search query is required.'} }); - } - - // Send search query to Solr - const response = await axios.get(solrUrl + '/select', { - params: { - q: `text:${sanitizedQuery}`, // Query string with field name - hl: 'true', - 'hl.method': 'unified', - 'hl.fl': '*', - 'hl.snippets': 5, - 'hl.tag.pre': '', - 'hl.tag.post': '', - 'hl.usePhraseHighlighter': true, - start, // Start offset for pagination - rows: 10, // Number of rows to return - wt: 'json', // Response format (JSON) - }, - }); - - // Extract search results from Solr response - const searchResults = response.data.response.docs; - const highlightedSnippets = response.data.highlighting; - - // Calculate total number of results (needed for pagination) - const totalResults = response.data.response.numFound; - - // Calculate total number of pages - const totalPages = Math.ceil(totalResults / pageSize); - - // Send search results as JSON response - //res.json('search-results', { query, searchResults, highlightedSnippets, page, pageSize, totalResults, totalPages }); - res.render('search-results', { h: helpers, query: sanitizedQuery, searchResults, highlightedSnippets, page, pageSize, totalResults, totalPages }); - } catch (error) { - // console.error('Error searching Solr:', error.message); - // res.status(500).json({ error: 'Internal server error' }); - res.render('search-error', { h: helpers, query: sanitizedQuery, error }); - } -}); - -// Function to sanitize search query to prevent code injection -function sanitizeQuery(query) { - // Remove any characters that are not alphanumeric or whitespace - return query.replace(/[^\w\s*,."]/gi, ''); -} - //app.get('/OCR-Encoded-PDFs/Russell-County-Web-Site_2024-02-13_19_50_Modified-With-OCR-Encoding**', rewriter.rewrite('/Web_Site_Archives/Russell_County_Web_Site-2024-02-13_19_50_Modified_With_OCR_Encoding/$1')); app.get('*', async (req, res) => { diff --git a/app/vercel-serve.js b/app/vercel-serve.js index 7f684562..66e176d3 100644 --- a/app/vercel-serve.js +++ b/app/vercel-serve.js @@ -34,17 +34,17 @@ const directoryTemplate = (vals) => { }); }); }; -// const errorTemplate = (vals) => { -// return new Promise((resolve, reject) => { -// ejs.renderFile("views/error.ejs", { h: helpers, ...vals }, (err, str) => { -// if (err) { -// reject(err); -// } else { -// resolve(str); -// } -// }); -// }); -// }; +const errorTemplate = (vals) => { + return new Promise((resolve, reject) => { + ejs.renderFile("views/error.ejs", { h: helpers, ...vals }, (err, str) => { + if (err) { + reject(err); + } else { + resolve(str); + } + }); + }); +}; const etags = new Map(); diff --git a/package.json b/package.json index 5282619a..178f59cc 100644 --- a/package.json +++ b/package.json @@ -2,8 +2,9 @@ "name": "express", "version": "1.0.0", "description": "", - "main": "index.js", + "main": "app/server.js", "scripts": { + "server": "node app/server.js", "test": "echo \"Error: no test specified\" && exit 1", "transpile:ts": "tsc -project tsconfig.build.json", "index": "gulp index", @@ -26,6 +27,7 @@ "gulp": "^4.0.2", "gulp-if": "^3.0.0", "js-yaml": "^4.1.0", + "lucene": "^2.1.1", "markdown-it": "^14.0.0", "markdown-it-attrs": "^4.1.6", "mime-types": "2.1.18", @@ -39,6 +41,7 @@ "path-to-regexp": "2.2.1", "range-parser": "1.2.0", "request-promise-native": "^1.0.9", + "solr-client": "^0.10.0-rc10", "ssh2-sftp-client": "^10.0.3", "superagent": "^8.1.2", "through2": "^4.0.2", diff --git a/routes/search.js b/routes/search.js new file mode 100644 index 00000000..92b3d296 --- /dev/null +++ b/routes/search.js @@ -0,0 +1,146 @@ +const express = require('express'); +const router = express.Router(); +const { parse, toString } = require('lucene'); +const { createClient, Query } = require('solr-client'); +const solrConfig = { host: 'solr.services.cleveland.daball.me', port: 8983, core: 'my_core' }; +const helpers = require('../views/helpers/functions'); + +router.get('/', (req, res) => { + // Extract paging parameters from request query parameters + let { q, page = 1, pageSize = 10 } = req.query; + if (page instanceof String) page = parseInt(page); + if (pageSize instanceof String) pageSize = parseInt(pageSize); + // Cap at 100 max per page + pageSize = Math.min(pageSize, 100); + // Calculate start offset for pagination + const start = (page - 1) * pageSize; + if (!q || (typeof q === 'string' && q.trim() == "")) { + res.render('search-error', { h: helpers, query: q, error: { code: 400, message: 'Search query is required.'} }); + } + else { + // Parse query + let parsedQuery = parse(q); + // Construct a Solr q field query string based on the extracted components + let qQuery = toString(parsedQuery); + // Generate a Solr query based on the query strings and additional parameters + let solrQuery = new Query().df('text').q(qQuery).start(start).rows(10).hl({ options: { + on: true, + q: qQuery, + fl: '*', + snippets: 5, + formatter: 'simple', + simplePre: ``, + simplePost: ``, + highlightMultiTerm: true, + usePhraseHighlighter: true, + }}); + // Create a Solr client + const solrClient = createClient({ host: 'solr.services.cleveland.daball.me', port: 8983, core: 'my_core' }); + solrClient.search(solrQuery) + .then(solrResponse => { + //console.log(require('util').inspect(solrResponse, { showHidden: true, depth: null, colors: true })); + // overcome broken hl simplePre/simplePost implementation + let overrideHighlighting = {}; + Object.keys(solrResponse.highlighting).forEach((highlight_key) => { + overrideHighlighting[highlight_key] = solrResponse.highlighting[highlight_key]; + if (overrideHighlighting[highlight_key].text && overrideHighlighting[highlight_key].text.length > 0) { + overrideHighlighting[highlight_key].text = overrideHighlighting[highlight_key].text.map( (text) => { + return text.replaceAll("", ``).replaceAll("", "") + }); + } + }); + solrResponse.highlighting = overrideHighlighting; + // Calculate total number of results (needed for pagination) + const totalResults = solrResponse.response.numFound; + // Calculate total number of pages + const totalPages = Math.ceil(totalResults / pageSize); + res.render('search-results', { + h: helpers, + query: qQuery, + page, + pageSize, + totalResults, + totalPages, + solrQuery: solrQuery, + ...solrResponse + }); + // res.render('search-error', { h: helpers, query: sanitizedQuery, error: { code: 400, message: 'Search query is required.'} }); + }) + .catch(error => { + if (typeof error === 'object' && error instanceof Error) { + // check for error from throw new Error(`Request HTTP error ${response.statusCode}: ${text}`) in solr.ts from + // solr-node-client dependency + const detectRequestHttpErrorRegExLit = /^Request HTTP error (?\d{1,3}): (?\{.*\}$)/s; + const detectRequestHttpErrorRegExp = new RegExp(detectRequestHttpErrorRegExLit); + const matchRequestHttpErrorRegExpInError = error.message.match(detectRequestHttpErrorRegExp); + const statusCode = (matchRequestHttpErrorRegExpInError && matchRequestHttpErrorRegExpInError.groups && matchRequestHttpErrorRegExpInError.groups.statusCode); + const text = (matchRequestHttpErrorRegExpInError && matchRequestHttpErrorRegExpInError.groups && matchRequestHttpErrorRegExpInError.groups.text); + if (text) { + let solrRequestHttpInternalError = JSON.parse(text); + error = { + message: "Solr Client Request HTTP Error", + code: statusCode, + innerError: solrRequestHttpInternalError + }; + } + else { + error = { + message: error + }; + } + } + res.render('search-error', { h: helpers, query: qQuery, error }); + }); + } + + // // Sanitize search query to prevent code injection + // try { + // // Validate search query + // if (!query) { + // //return res.status(400).json({ error: 'q parameter is required' }); + // + // } + // else { + // // Send search query to Solr + // const response = await axios.get(solrUrl + '/select', { + // params: { + // q: `text:${sanitizedQuery}`, // Query string with field name + // hl: 'true', + // 'hl.method': 'unified', + // 'hl.fl': '*', + // 'hl.snippets': 5, + // 'hl.tag.pre': '', + // 'hl.tag.post': '', + // 'hl.usePhraseHighlighter': true, + // start, // Start offset for pagination + // rows: 10, // Number of rows to return + // wt: 'json', // Response format (JSON) + // }, + // }); + // + // // Extract search results from Solr response + // const searchResults = response.data.response.docs; + // const highlightedSnippets = response.data.highlighting; + + // // Calculate total number of results (needed for pagination) + // const totalResults = response.data.response.numFound; + + // // Calculate total number of pages + // const totalPages = Math.ceil(totalResults / pageSize); + + // // Send search results as JSON response + // //res.json('search-results', { query, searchResults, highlightedSnippets, page, pageSize, totalResults, totalPages }); + // res.render('search-results', { h: helpers, query: sanitizedQuery, searchResults, highlightedSnippets, page, pageSize, totalResults, totalPages }); + // } + // } catch (error) { + // // console.error('Error searching Solr:', error.message); + // // res.status(500).json({ error: 'Internal server error' }); + // res.render('search-error', { h: helpers, query: sanitizedQuery, error }); + // } +}); + +module.exports = { + router, + // solrUrl, + // sanitizeQuery, +}; \ No newline at end of file diff --git a/views/error.ejs b/views/error.ejs index 6fe37907..91181817 100644 --- a/views/error.ejs +++ b/views/error.ejs @@ -113,8 +113,8 @@
- <%= statusCode %> -

<%= message %>

+ <% if (typeof statusCode !== 'undefined') { %><%= statusCode %><% } %> +

<% if (typeof message !== 'undefined') { %><%= message %><% } %>

diff --git a/views/search-error.ejs b/views/search-error.ejs index 4036815c..8cff172e 100644 --- a/views/search-error.ejs +++ b/views/search-error.ejs @@ -22,11 +22,14 @@

- <% if (typeof error != undefined) {%> + <% if (typeof error !== 'undefined') {%>

An error occurred while attempting to perform a search.

- <% if (typeof query != undefined) {%>

Search Query: <%= query %>

<% } %> - <% if (typeof error.code != undefined) {%>

Error Code: <%= error.code %>

<% } %> - <% if (typeof error.message != undefined) {%>

Error Message: <%= error.message %>

<% } %> + <% if (typeof query !== 'undefined') {%>

Search Query: <%= query %>

<% } %> + <% if (typeof error.code !== 'undefined') {%>

Error Code: <%= error.code %>

<% } %> + <% if (typeof error.message !== 'undefined') {%>

Error Message: <%= error.message %>

<% } %> + <% if (typeof error.innerError !== 'undefined' && typeof error.innerError.error !== 'undefined') { %> + <% if (typeof error.innerError.error.msg !== 'undefined') {%>

Inner Error Message: <%- error.innerError.error.msg.replaceAll("<", "<").replaceAll(">", ">").replaceAll("\n", '
') %>

<% } %> + <% } %> <% } %>
diff --git a/views/search-results.ejs b/views/search-results.ejs index f8005298..aab3eeb1 100644 --- a/views/search-results.ejs +++ b/views/search-results.ejs @@ -15,55 +15,60 @@ Search Results for <%- query %> -
-

- Disclaimer: Use of the search feature is subject to both the Search - Policy and the Privacy Policy. -

-
-
- - <% if (searchResults.length === 0) { %> -

No documents found matching the search query.

- <% } else { %> -
    - <% searchResults.forEach(result => { %> -
  • -
    <%= result.title %>
    - <% if (highlightedSnippets[result.id] && highlightedSnippets[result.id].text) { %> - <% highlightedSnippets[result.id].text.forEach(snippet => { %> -

    <%- snippet %>

    - <% }); %> - <% } else { %> -

    No snippet available

    - <% } %> - <%= result.url %> -
  • - <% }); %> -
- <% } %> -
+
+

+ Disclaimer: Use of the search feature is subject to both the Search + Policy and the Privacy Policy. +

+ <% if (typeof response !== "undefined" && typeof response.numFound !== "undefined" && typeof response.docs !== "undefined" && typeof highlighting !== "undefined") { %> +
+ + <% if (response.numFound == 0) { %> +

No documents found matching the search query.

+ <% } else { %> +
    + <% response.docs.forEach(doc => { %> +
  • +
    <%= doc.title %>
    + <% if (highlighting[doc.id] && highlighting[doc.id].text) { %> + <% highlighting[doc.id].text.forEach(snippet => { %> +

    <%- snippet %>

    + <% }); %> + <% } else { %> + + <% } %> + <%= doc.url %> +
  • + <% }); %> +
+ <% } %> +
+ + <% } %> - + <% if (typeof totalPages !== "undefined") { %> + +

Page <%= page %> out of <%= totalPages %>. Displaying results <%= (page-1)*pageSize+1 %> through <%= Math.min(page*pageSize, totalResults) %> out of <%= totalResults %> total results.

+ <% } %> <%- include('./includes/bottom-navbar.ejs') %> diff --git a/web.config b/web.config index 0860aaa7..6fdec002 100644 --- a/web.config +++ b/web.config @@ -18,11 +18,13 @@ + + \ No newline at end of file