dashersupply/src/scraper/amazon.ts

85 lines
3.3 KiB
TypeScript

/**
* Used temporarily until I get access to Amazon Product API.
*/
import cheerio, { type CheerioAPI } from 'cheerio';
import { type ProductDetails } from '../data/products/amazon-product-details';
import { type ProductAttribute } from '../data/products/product-attribute';
import { parseNumberFromSelector } from './utils';
/**
 * CSS selectors used to locate each piece of product data.
 * Alternative selectors may work as well; adjust them here if the markup changes.
 */
const SELECTORS = {
  // Selectors applied to the page through `$`.
  TITLE: 'span#productTitle',
  PRICE: 'span.priceToPay',
  // LIST_PRICE: 'span.basisPrice .a-offscreen',
  FEATURE_BULLETS: '#feature-bullets li span.a-list-item',
  DESCRIPTION: '#productDescription',
  REVIEW_RATING: '#acrPopover a > span',
  REVIEW_COUNT: '#acrCustomerReviewText',
  IMAGES: '#altImages .item img',
  PRODUCT_ATTRIBUTE_ROWS: '#productOverview_feature_div tr',

  // Row-relative selectors: resolved with `find()` inside each attribute row.
  ATTRIBUTES_LABEL: 'td:nth-of-type(1) span',
  ATTRIBUTES_VALUE: 'td:nth-of-type(2) span',
} as const;
/**
 * Collects the `src` attribute of every element matched by `SELECTORS.IMAGES`.
 * - Elements that have no `src` attribute contribute nothing to the result.
 */
const extractImageUrls = ($: CheerioAPI): string[] => {
  const sources: string[] = [];
  for (const imageEl of $(SELECTORS.IMAGES).get()) {
    const src = $(imageEl).attr('src');
    // Same outcome as Cheerio's `map()`, which drops `null`/`undefined` callback results.
    if (src != null) {
      sources.push(src);
    }
  }
  return sources;
};
/**
 * Extracts the product attributes (label/value pairs) from the given Cheerio object.
 * - Every row matched by `SELECTORS.PRODUCT_ATTRIBUTE_ROWS` yields exactly one attribute.
 * - Label and value are looked up relative to their row and trimmed, so whitespace
 *   from the markup does not leak into the result (matching how the title,
 *   description and feature bullets are extracted in this file).
 */
const extractProductAttributes = ($: CheerioAPI): ProductAttribute[] => {
  const attributeRowEls = $(SELECTORS.PRODUCT_ATTRIBUTE_ROWS).get();
  return attributeRowEls.map((rowEl) => {
    const row = $(rowEl);
    const label = row.find(SELECTORS.ATTRIBUTES_LABEL).text().trim();
    const value = row.find(SELECTORS.ATTRIBUTES_VALUE).text().trim();
    return { label, value };
  });
};
/**
 * Returns the trimmed text of every element matched by `SELECTORS.FEATURE_BULLETS`,
 * one entry per matched element, in match order.
 */
const extractFeatureBullets = ($: CheerioAPI): string[] => {
  // `get()` turns the Cheerio selection into a plain array of elements.
  const bulletEls = $(SELECTORS.FEATURE_BULLETS).get();
  return bulletEls.map((bulletEl) => $(bulletEl).text().trim());
};
/**
 * Scrapes the product details from the given Cheerio object.
 *
 * @param $ - Cheerio API instance to query with the selectors in `SELECTORS`.
 * @returns The title, description, feature bullets, price, review count,
 * review rating, image URLs and attributes found via those selectors.
 */
export const extractProductDetails = ($: CheerioAPI): ProductDetails => {
  const title = $(SELECTORS.TITLE).text().trim();
  // `text()` returns a string, so no non-null assertion is needed before `trim()`.
  const description = $(SELECTORS.DESCRIPTION).text().trim();
  const price = parseNumberFromSelector($, SELECTORS.PRICE);
  // const listPrice = parseNumberFromSelector($, SELECTORS.LIST_PRICE);
  const reviewRating = parseNumberFromSelector($, SELECTORS.REVIEW_RATING);
  const reviewCount = parseNumberFromSelector($, SELECTORS.REVIEW_COUNT);
  const imageUrls = extractImageUrls($);
  const attributes = extractProductAttributes($);
  const featureBullets = extractFeatureBullets($);
  return {
    title,
    description,
    featureBullets,
    price,
    /*listPrice,*/
    reviewCount,
    reviewRating,
    imageUrls,
    attributes,
  };
};