/*
 * Untitled
 *
 * avatar
 * unknown
 * javascript
 * 2 years ago
 * 18 kB
 * 19
 * Indexable
 */
const puppeteer = require("puppeteer");
const fs = require("fs");
const axios = require("axios");

// Landing page every category is opened from.
const DEALS_URL = "https://www.amazon.ae/deals?ref_=nav_cs_gb";

// Every recoverable failure is appended to this single log file.
const ERROR_LOG_FILE = "amazon-error.log";

// The scraper deliberately tolerates very slow page loads (15 minutes).
const LONG_TIMEOUT_MS = 900000;
const NAVIGATION_OPTIONS = Object.freeze({
  waitUntil: "load",
  timeout: LONG_TIMEOUT_MS,
});

const CATEGORY_LINK_SELECTOR = "#anonCarousel1 > ol > li > a";
const PRODUCT_GRID_SELECTOR =
  "#grid-main-container > div.a-row.Grid-module__gridSection_1SEJTeTsU88s6aVeuuekAp > div > div";

/** Appends a recoverable error to the scraper's error log. */
function logError(error) {
  fs.appendFileSync(ERROR_LOG_FILE, error + "\n");
}

/**
 * Browser-side extractor for the category carousel (used with `page.$$eval`).
 *
 * Runs inside the page, so it must stay self-contained: it cannot reference
 * anything defined in this Node module.
 *
 * @param {Element[]} items - The carousel anchor elements.
 * @returns {Array<{name: string, link: string, dataTestId: ?string, image_url: string, products: Array}>}
 */
function extractCategories(items) {
  return items.map((item) => {
    // The label is the last <span> inside the anchor.
    const label = item.querySelector("span:last-of-type");
    const image = item.querySelector(
      "span.GridPresets-module__gridPresetImageSection_2p68sRHExZZwCJorBe2_N3 > img"
    );

    return {
      // Missing nodes yield "" instead of throwing and aborting the whole run.
      name: label ? label.innerText : "",
      link: item.href,
      dataTestId: item.getAttribute("data-testid"),
      image_url: image ? image.src : "",
      products: [],
    };
  });
}

/**
 * Browser-side extractor for the deal cards of the currently displayed grid
 * (used with `page.evaluate`). Must stay self-contained.
 *
 * @param {string} gridSelector - Selector matching one element per deal card.
 * @returns {Array<{name: string, image: ({image_url: string}|string), offer: string, link: string, price: string}>}
 */
function extractGridProducts(gridSelector) {
  const textOf = (node) => (node ? node.textContent : "");

  return Array.from(document.querySelectorAll(gridSelector)).map((card) => {
    const nameNode = card.querySelector(
      "div.DealGridItem-module__dealItemContent_1vFddcq1F8pUxM8dd9FW32 > div > div > a:nth-child(3) > div"
    );
    const offerNode = card.querySelector(
      "div > div > div > a.a-size-mini.a-link-normal.DealLink-module__dealLink_3v4tPYOP4qJj9bdiy0xAT.a-color-base.a-text-normal > div:nth-child(1) > div"
    );
    const linkNode = card.querySelector(
      "div.DealGridItem-module__dealItemContent_1vFddcq1F8pUxM8dd9FW32 > div > div > a:nth-child(3)"
    );
    const priceNode = card.querySelector(
      "div > div > div > span > span > span:nth-child(2) > span.a-price-whole"
    );
    const imageNode = card.querySelector("div > div > a > div > div > img");

    return {
      name: textOf(nameNode),
      image: imageNode ? { image_url: imageNode.src } : "",
      offer: textOf(offerNode),
      link: linkNode ? linkNode.href : "",
      price: textOf(priceNode),
    };
  });
}

/**
 * Browser-side extractor for a single product detail page (used with
 * `page.evaluate`). Must stay self-contained, which is why the small helpers
 * are declared inside it.
 *
 * Every field falls back to "" when its node is missing.
 *
 * @returns {{title: string, rating: {rating: (number|string), rated_out_of: (number|string)},
 *   offer: string, currentPrice: string, oldPrice: (number|string), currency: string,
 *   details: Array<{title: string, description: string}>, color: string,
 *   summary: Array<{description: string}>, images: ({image_url: string}|string)}}
 */
function extractSingleProductDetails() {
  const textOf = (node) => (node ? node.textContent : "");

  // Strips the currency symbol and thousands separators before parsing;
  // parseFloat("1,299.00") would otherwise stop at the comma and yield 1.
  const parsePrice = (text, currencySymbol) => {
    const amount = parseFloat(
      text.replace(currencySymbol, "").replace(/,/g, "")
    );
    return Number.isNaN(amount) ? "" : amount;
  };

  const priceToPay =
    "#corePriceDisplay_desktop_feature_div > div.a-section.a-spacing-none.aok-align-center > span.a-price.aok-align-center.reinventPricePriceToPayMargin.priceToPay > span:nth-child(2)";

  const titleNode = document.querySelector("#productTitle");
  const offerNode = document.querySelector(
    "#corePriceDisplay_desktop_feature_div > div.a-section.a-spacing-none.aok-align-center > span.a-size-large.a-color-price.savingPriceOverride.aok-align-center.reinventPriceSavingsPercentageMargin.savingsPercentage"
  );
  const wholeNode = document.querySelector(`${priceToPay} > span.a-price-whole`);
  const fractionNode = document.querySelector(
    `${priceToPay} > span.a-price-fraction`
  );
  const currencyNode = document.querySelector(
    `${priceToPay} > span.a-price-symbol`
  );
  const oldPriceNode = document.querySelector(
    "#corePriceDisplay_desktop_feature_div > div.a-section.a-spacing-small.aok-align-center > span > span.a-size-small.a-color-secondary.aok-align-center.basisPrice > span > span.a-offscreen"
  );
  const colorNode = document.querySelector("#variation_color_name > div > span");
  const imageNode = document.querySelector("#landingImage");
  const ratingNode = document.querySelector(
    "#acrPopover > span.a-declarative > a > i.a-icon.a-icon-star > span"
  );

  // Plain synchronous map: an async callback would produce promises, which
  // do not survive serialization back to Node.
  const details = Array.from(
    document.querySelectorAll(
      "#productOverview_feature_div > div > table > tbody > tr"
    )
  ).map((row) => {
    const title = row.querySelector("td.a-span3 > span");
    const description = row.querySelector("td.a-span9 > span");
    return {
      title: title ? title.innerText : "",
      description: description ? description.innerText : "",
    };
  });

  // The object literal is parenthesized so the arrow function returns it.
  const summary = Array.from(
    document.querySelectorAll("#feature-bullets > ul > li")
  ).map((bullet) => ({
    description: textOf(bullet.querySelector("span")),
  }));

  const rating = { rating: "", rated_out_of: "" };
  if (ratingNode) {
    // The text is split on whitespace; words 0 and 3 are read as the score
    // and the scale (e.g. "4.5 out of 5 stars").
    const words = ratingNode.textContent.trim().split(/\s+/);
    rating.rating = parseFloat(words[0]);
    rating.rated_out_of = parseFloat(words[3]);
  }

  const currency = textOf(currencyNode);

  return {
    title: textOf(titleNode),
    rating,
    offer: textOf(offerNode),
    // The fraction part is optional; do not dereference it blindly.
    currentPrice: wholeNode ? wholeNode.textContent + textOf(fractionNode) : "",
    oldPrice: oldPriceNode ? parsePrice(oldPriceNode.textContent, currency) : "",
    currency,
    details,
    color: textOf(colorNode),
    summary,
    images: imageNode ? { image_url: imageNode.src } : "",
  };
}

/**
 * Browser-side extractor for a deal page that lists several products (used
 * with `page.evaluate`). Must stay self-contained.
 *
 * @returns {Array<{link: string, name: string, images: {image_url: string}, offer: string,
 *   currency: string, currentPrice: string, oldPrice: (number|string),
 *   rating: {rating: (number|string), rated_out_of: (number|string)}}>}
 */
function extractSubProducts() {
  const textOf = (node) => (node ? node.textContent : "");

  // See extractSingleProductDetails: duplicated because browser-side
  // functions cannot share helpers defined in Node.
  const parsePrice = (text, currencySymbol) => {
    const amount = parseFloat(
      text.replace(currencySymbol, "").replace(/,/g, "")
    );
    return Number.isNaN(amount) ? "" : amount;
  };

  const info = "span > div > div.a-section.octopus-dlp-asin-info-section";
  const badge = `${info} > div.a-row.a-spacing-mini.a-grid-vertical-align.a-grid-center > div > div.a-size-mini.oct-deal-badge-element.oct-deal-badge-label`;
  const price = `${info} > div.a-row.octopus-dlp-price > span.a-price.octopus-widget-price > span:nth-child(2)`;

  // Synchronous map instead of forEach(async ...): the async version returned
  // the result array before any item had been pushed into it.
  return Array.from(
    document.querySelectorAll("#octopus-dlp-asin-stream > ul > li")
  ).map((item) => {
    const nameNode = item.querySelector(
      `${info} > div.a-section.octopus-dlp-asin-title > a`
    );
    const imageNode = item.querySelector(
      "span > div > div.a-section.a-spacing-base.a-text-center.octopus-dlp-image-section > a > img"
    );
    const offerLeft = item.querySelector(`${badge} > span:nth-child(1)`);
    const offerRight = item.querySelector(`${badge} > span:nth-child(2)`);
    const oldPriceNode = item.querySelector(
      `${info} > div.a-row.octopus-dlp-price > span.octopus-widget-price-saving-info > span.a-size-mini.a-color-tertiary.octopus-widget-strike-through-price.a-text-strike`
    );
    const ratingNode = item.querySelector(`${info} > div:nth-child(2) > i > span`);

    const currency = textOf(item.querySelector(`${price} > span.a-price-symbol`));
    const whole = textOf(item.querySelector(`${price} > span.a-price-whole`));
    const fraction = textOf(
      item.querySelector(`${price} > span.a-price-fraction`)
    );

    const rating = { rating: "", rated_out_of: "" };
    if (ratingNode) {
      const words = ratingNode.textContent.trim().split(/\s+/);
      rating.rating = parseFloat(words[0]);
      rating.rated_out_of = parseFloat(words[3]);
    }

    return {
      link: nameNode ? nameNode.href : "",
      name: nameNode ? nameNode.innerText : "",
      images: { image_url: imageNode ? imageNode.src : "" },
      // The offer badge is only reported when both halves are present.
      offer:
        offerLeft && offerRight
          ? offerLeft.textContent + offerRight.textContent
          : "",
      currency,
      currentPrice: whole + fraction,
      // `currency` is already a string here; use it directly.
      oldPrice: oldPriceNode ? parsePrice(oldPriceNode.textContent, currency) : "",
      rating,
    };
  });
}

/**
 * Scrapes the product detail page currently loaded in `page`.
 *
 * @param {object} page - Puppeteer page showing a single-product page.
 * @returns {Promise<object>} Result of {@link extractSingleProductDetails}.
 */
function getSingleProductDetails(page) {
  return page.evaluate(extractSingleProductDetails);
}

/**
 * Scrapes a multi-product deal page currently loaded in `page`, then visits
 * each listed product and attaches its details as `subProducts`.
 *
 * Not called at the moment: `run` only handles pages that expose
 * `#productTitle`. Kept so the multi-product branch can be re-enabled.
 *
 * @param {object} page - Puppeteer page showing a multi-product deal page.
 * @returns {Promise<object[]>} The listed products; an entry gains a
 *   `subProducts` array only when its own page was scraped.
 */
async function getSubProductDetails(page) {
  const subProducts = await page.evaluate(extractSubProducts);

  for (const subProduct of subProducts) {
    try {
      await page.goto(subProduct.link, NAVIGATION_OPTIONS);
    } catch (e) {
      logError(e);
      continue;
    }

    try {
      const item = await getSingleProductDetails(page);
      subProduct.subProducts = item ? [item] : [];
      await page.goBack();
    } catch (e) {
      logError(e);
    }
  }

  return subProducts;
}

/**
 * Opens one product link, scrapes it when it is a single-product page, and
 * navigates back.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} link - Absolute product URL; "" is skipped.
 * @returns {Promise<?object>} Product details, or null when the page could
 *   not be opened, is not a single-product page, or scraping failed.
 */
async function scrapeProductPage(page, link) {
  if (!link) {
    return null;
  }

  try {
    await page.goto(link, NAVIGATION_OPTIONS);
  } catch (e) {
    logError(e);
    return null;
  }

  let details = null;
  try {
    // Only pages with #productTitle are single-product pages.
    if (await page.$("#productTitle")) {
      details = await getSingleProductDetails(page);
    }
    await page.goBack();
  } catch (e) {
    // A failed scrape or goBack must not abort the whole run; the next
    // product is opened with an absolute URL anyway.
    logError(e);
  }

  return details;
}

/**
 * Opens one category from the deals page and collects the details of every
 * single-product deal it lists, following the pagination.
 *
 * @param {object} page - Puppeteer page currently showing the deals page.
 * @param {{dataTestId: ?string}} category - Category entry from the carousel.
 * @returns {Promise<{products: object[], isOnMainPage: boolean}>} The scraped
 *   products and whether `page` is known to be back on the deals page.
 */
async function scrapeCategory(page, category) {
  const products = [];

  try {
    // Start waiting BEFORE clicking: if the navigation finishes before
    // waitForNavigation is registered, the wait would only end by timeout.
    await Promise.all([
      page.waitForNavigation(NAVIGATION_OPTIONS),
      page.click(`[data-testid="${category.dataTestId}"]`),
    ]);

    await page.waitForSelector(PRODUCT_GRID_SELECTOR, {
      visible: true,
      timeout: LONG_TIMEOUT_MS,
    });
  } catch (e) {
    logError(e);
    return { products, isOnMainPage: false };
  }

  // A role="note" element marks a category without products.
  if (await page.$('div[role="note"]')) {
    return { products, isOnMainPage: false };
  }

  for (;;) {
    const gridProducts = await page.evaluate(
      extractGridProducts,
      PRODUCT_GRID_SELECTOR
    );

    for (const gridProduct of gridProducts) {
      const details = await scrapeProductPage(page, gridProduct.link);
      if (details) {
        products.push(details);
      }
    }

    // NOTE(review): as in the original flow, this reloads the deals landing
    // page, so the "next" button handled below belongs to that page rather
    // than to the category that was clicked. Confirm this is intended.
    try {
      await page.goto(DEALS_URL, NAVIGATION_OPTIONS);
    } catch (e) {
      logError(e);
      return { products, isOnMainPage: false };
    }

    try {
      await page.waitForSelector("li.a-last", {
        visible: true,
        timeout: LONG_TIMEOUT_MS,
      });

      // A disabled "next" button means this was the last page.
      if ((await page.$("li.a-disabled.a-last")) !== null) {
        return { products, isOnMainPage: true };
      }

      const nextBtn = await page.$("li.a-last");
      if (!nextBtn) {
        return { products, isOnMainPage: true };
      }

      await Promise.all([
        page.waitForNavigation(NAVIGATION_OPTIONS),
        nextBtn.click(),
      ]);
    } catch (e) {
      // Keep what was scraped so far instead of losing the whole run.
      logError(e);
      return { products, isOnMainPage: false };
    }
  }
}

/**
 * Scrapes the Amazon.ae deals page: reads the category carousel, visits each
 * category and its single-product deals, and writes the result to
 * `amazon.json`. Recoverable failures are appended to `amazon-error.log`.
 *
 * @param {object} [options]
 * @param {number} [options.startCategoryIndex=18] - Index (after the first
 *   carousel entry has been dropped) of the first category to scrape. The
 *   default keeps the previously hard-coded start; pass 0 to scrape all.
 * @returns {Promise<void>} Resolves once `amazon.json` has been written.
 * @throws Rejects with the original error on any unrecoverable failure
 *   (after logging it); the browser is closed in every case.
 */
async function run({ startCategoryIndex = 18 } = {}) {
  let browser;

  try {
    browser = await puppeteer.launch({
      // headless: false,
      // null disables Puppeteer's default viewport; the real one is set below.
      defaultViewport: null,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--single-process",
        "--disable-gpu",
        "--ignore-certificate-errors",
      ],
    });

    const page = await browser.newPage();
    await page.setCacheEnabled(true);
    // Very tall viewport so the whole grid is rendered at once.
    await page.setViewport({ width: 2300, height: 6000 });
    await page.goto(DEALS_URL, {
      waitUntil: "networkidle2",
      timeout: LONG_TIMEOUT_MS,
    });

    const categories = await page.$$eval(
      CATEGORY_LINK_SELECTOR,
      extractCategories
    );
    // The first carousel entry is not scraped.
    categories.shift();

    let isOnMainPage = true;

    for (
      let i = Math.max(0, startCategoryIndex);
      i < categories.length;
      i++
    ) {
      console.log("category in:  ", i, " -- isOnMainPage: ", isOnMainPage);

      if (!isOnMainPage) {
        try {
          // goto already waits for the load event; an extra
          // waitForNavigation here would only ever time out.
          await page.goto(DEALS_URL, NAVIGATION_OPTIONS);
          isOnMainPage = true;
        } catch (e) {
          logError(e);
          continue;
        }
      }

      console.log("isOnMainPage finally: ", isOnMainPage);

      const outcome = await scrapeCategory(page, categories[i]);
      categories[i]["products"] = outcome.products;
      isOnMainPage = outcome.isOnMainPage;
      console.log("cat " + i + " done");
    }

    const items = {
      url: "https://www.amazon.ae",
      categories: [...categories],
    };

    fs.writeFileSync("amazon.json", JSON.stringify(items));
  } catch (e) {
    console.log("errror --------- ", e);
    logError(e);
    throw e;
  } finally {
    // Always release the browser process, even when scraping failed.
    if (browser) {
      try {
        await browser.close();
      } catch (e) {
        logError(e);
      }
    }
  }
}
// Start the scrape. run() logs its own failure before rejecting, so the
// handler only marks the process as failed instead of leaving the rejection
// unhandled.
run().catch(() => {
  process.exitCode = 1;
});
// Editor is loading...