/**
 * Temporary scraping-based implementation, used until access to the Amazon Product API is available.
 */

import cheerio, { type CheerioAPI } from 'cheerio';
|
|
import { type ProductDetails } from '../data/products/amazon-product-details';
|
|
import { type ProductAttribute } from '../data/products/product-attribute';
|
|
import { parseNumberFromSelector } from './utils';
|
|
|
|
/**
 * CSS selectors used to locate each product detail in the scraped page.
 * Feel free to add or swap in different variations of these selectors.
 */
const SELECTORS = {
  TITLE: 'span#productTitle',
  PRICE: 'span.priceToPay',
  // Disabled together with the commented-out list-price extraction in `extractProductDetails`.
  // LIST_PRICE: 'span.basisPrice .a-offscreen',
  FEATURE_BULLETS: '#feature-bullets li span.a-list-item',
  DESCRIPTION: '#productDescription',
  REVIEW_RATING: '#acrPopover a > span',
  REVIEW_COUNT: '#acrCustomerReviewText',
  IMAGES: '#altImages .item img',
  PRODUCT_ATTRIBUTE_ROWS: '#productOverview_feature_div tr',
  // Row-relative selectors: resolved with `.find()` on each row matched by PRODUCT_ATTRIBUTE_ROWS.
  ATTRIBUTES_LABEL: 'td:nth-of-type(1) span',
  ATTRIBUTES_VALUE: 'td:nth-of-type(2) span',
} as const;
|
|
|
|
/**
 * Extracts the product image URLs from the given Cheerio object.
 * - Reads the `src` attribute of every element matched by `SELECTORS.IMAGES`.
 * - Images whose `src` is missing or blank are skipped.
 *
 * @param $ - Cheerio API loaded with the product page markup.
 * @returns The collected `src` values; an empty array when nothing matches.
 */
const extractImageUrls = ($: CheerioAPI): string[] =>
  $(SELECTORS.IMAGES)
    .map((_, imageEl) => {
      const src = $(imageEl).attr('src')?.trim();
      // Cheerio's `map()` (not `get()`) drops entries for which the callback returns `null`/`undefined`.
      return src ? src : null;
    })
    .get(); // `get()` converts the Cheerio collection into a plain array.
|
|
|
|
/**
 * Extracts the product attributes from the given Cheerio object.
 * - Iterates over the rows matched by `SELECTORS.PRODUCT_ATTRIBUTE_ROWS` and reads
 *   the label and value text of each row.
 * - Label and value are trimmed, consistent with the other text fields extracted in this file.
 * - Rows that yield neither a label nor a value are skipped.
 *
 * @param $ - Cheerio API loaded with the product page markup.
 * @returns One `{ label, value }` entry per non-empty row; an empty array when nothing matches.
 */
const extractProductAttributes = ($: CheerioAPI): ProductAttribute[] =>
  $(SELECTORS.PRODUCT_ATTRIBUTE_ROWS)
    .get()
    .map((rowEl) => {
      const row = $(rowEl);

      return {
        label: row.find(SELECTORS.ATTRIBUTES_LABEL).text().trim(),
        value: row.find(SELECTORS.ATTRIBUTES_VALUE).text().trim(),
      };
    })
    // A row with no label and no value carries no information.
    .filter(({ label, value }) => label !== '' || value !== '');
|
|
|
|
/**
 * Extracts the feature bullets from the given Cheerio object.
 * - Reads the trimmed text of every element matched by `SELECTORS.FEATURE_BULLETS`.
 * - Bullets whose text is empty after trimming are skipped.
 *
 * @param $ - Cheerio API loaded with the product page markup.
 * @returns The bullet texts; an empty array when nothing matches.
 */
const extractFeatureBullets = ($: CheerioAPI): string[] =>
  $(SELECTORS.FEATURE_BULLETS)
    .map((_, featureBulletEl) => {
      const text = $(featureBulletEl).text().trim();
      // Cheerio's `map()` (not `get()`) drops entries for which the callback returns `null`/`undefined`.
      return text === '' ? null : text;
    })
    .get(); // `get()` converts the Cheerio collection into a plain array.
|
|
|
|
/**
 * Scrapes the product details from the given Cheerio object.
 * - Text fields (title, description) are read via `SELECTORS` and trimmed.
 * - Numeric fields (price, review rating, review count) are delegated to
 *   `parseNumberFromSelector` from `./utils`.
 * - Images, attributes and feature bullets are collected by the dedicated
 *   extractors in this file.
 *
 * @param $ - Cheerio API loaded with the product page markup.
 * @returns The collected product details.
 */
export const extractProductDetails = ($: CheerioAPI): ProductDetails => {
  const title = $(SELECTORS.TITLE).text().trim();
  // `.text()` returns a string, so no non-null assertion is needed here.
  const description = $(SELECTORS.DESCRIPTION).text().trim();

  const price = parseNumberFromSelector($, SELECTORS.PRICE);
  // const listPrice = parseNumberFromSelector($, SELECTORS.LIST_PRICE);
  const reviewRating = parseNumberFromSelector($, SELECTORS.REVIEW_RATING);
  const reviewCount = parseNumberFromSelector($, SELECTORS.REVIEW_COUNT);

  const imageUrls = extractImageUrls($);
  const attributes = extractProductAttributes($);
  const featureBullets = extractFeatureBullets($);

  return { title, description, featureBullets, price, /*listPrice,*/ reviewCount, reviewRating, imageUrls, attributes };
};
|