scraper.js

 avatar
unknown
javascript
4 years ago
1.7 kB
3
Indexable
const cheerio = require("cheerio");
const superagent = require("superagent");
// NOTE: despite its name, this is a superagent agent, not the axios library.
// fetchData calls `.get()` on it and reads `.text` from the response.
const axios = superagent.agent();

// Origin that every scraped path is appended to when building a URL.
const siteUrl = "https://www.cermati.com";
// Module-scoped Set of article hrefs; being module-scoped, anything added to it
// persists across calls. fetchData and getResultDetails each take a parameter
// that is also named `link`, which shadows this Set inside those functions.
const link = new Set();


/**
 * Downloads a page from the site and parses it with cheerio.
 *
 * @param {string} link - Path appended to `siteUrl` to form the request URL.
 * @returns {Promise<Function>} Whatever `cheerio.load` returns for the response text.
 */
const fetchData = async (link) => {
  const response = await axios.get(siteUrl + link);
  const html = response.text;

  return cheerio.load(html);
};

/**
 * Collects the article hrefs listed on the `/artikel` page.
 *
 * For every `div.article-list-item` found under a `div.list-of-articles`, the
 * `href` of the anchor inside it is recorded. The hrefs are de-duplicated and
 * returned sorted.
 *
 * Items that yield no `href` are skipped, so the result never contains
 * `undefined`. The hrefs are gathered in a Set local to the call, so a call
 * only reports what is on the page it just fetched.
 *
 * @returns {Promise<string[]>} Unique hrefs, sorted with the default `Array.prototype.sort` order.
 */
const getResults = async () => {
  const $ = await fetchData('/artikel');

  // Local to this call: a Set shared between calls would keep returning
  // hrefs collected by earlier calls.
  const hrefs = new Set();

  $('div.list-of-articles').each((_listIndex, list) => {
    $(list).find('div.article-list-item').each((_itemIndex, item) => {
      const href = $(item).find('a').attr('href');
      // attr() gives undefined when the item has no anchor or no href.
      if (href) {
        hrefs.add(href);
      }
    });
  });

  return [...hrefs].sort();
};

/**
 * Scrapes the details of one article page.
 *
 * The returned array contains:
 *  - one `{ link, title, author, postingDate }` object per `section.post-content`
 *    element on the page (text fields are trimmed);
 *  - followed by one `{ relatedArticles: [{ url, title }, ...] }` object, but only
 *    when the page has a second element matching `div.col-lg-3 .margin-bottom-30`.
 *
 * Related items whose anchor has no `href` are skipped rather than emitted with
 * a URL ending in "undefined". Related titles are trimmed like the other text fields.
 *
 * @param {string} link - Path appended to `siteUrl` to form the page URL.
 * @returns {Promise<Array<object>>} The scraped detail objects, in the order described above.
 */
const getResultDetails = async (link) => {
  const $ = await fetchData(link);
  const detailContent = [];

  $('section.post-content').each((_index, detail) => {
    const post = $(detail);
    detailContent.push({
      link: `${siteUrl}${link}`,
      title: post.find('h1.post-title').text().trim(),
      author: post.find('span.author-name').text().trim(),
      postingDate: post.find('span.post-date').children('span').text().trim(),
    });
  });

  // Only the second matching sidebar panel (index 1) is read for related articles.
  const relatedPanel = $('div.col-lg-3 .margin-bottom-30').eq(1);
  if (relatedPanel.length > 0) {
    const relatedArticles = [];

    relatedPanel.find('ul.panel-items-list li').each((_index, related) => {
      const item = $(related);
      const href = item.find('li>a').attr('href');
      // Without an href there is no URL to report for this entry.
      if (href) {
        relatedArticles.push({
          url: `${siteUrl}${href}`,
          title: item.find('li a').children('h5.item-title').text().trim(),
        });
      }
    });

    detailContent.push({ relatedArticles });
  }

  return detailContent;
};
// Public API of this module: the article-list scraper and the article-detail scraper.
module.exports = { getResults, getResultDetails };
Editor is loading...