# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# a year ago
# 6.3 kB
# 2
# Indexable
# Never
from selenium import webdriver
from urllib import parse
from time import sleep


class GetBibs():
    """Fetch BibTeX entries for paper titles through a Selenium-driven Chrome.

    Pages are loaded asynchronously, so every page interaction is retried
    (see ``_poll``) until it succeeds or the attempt budget runs out. When a
    lookup cannot be completed the public methods return ``''`` rather than
    raising.

    Class attributes (override on the class or on an instance to tune):
        poll_attempts: maximum number of tries per page interaction.
        poll_interval: seconds slept after each failed try.
        render_delay: seconds to wait after the IEEE BibTeX pane appears
            before its text is read.
    """

    poll_attempts = 100
    poll_interval = 0.1
    render_delay = 2

    _IEEE_DOCUMENT_URL = 'https://ieeexplore.ieee.org/document/'

    def __init__(self, driver_path, option_path, ie_search_url, gg_search_url) -> None:
        """Start Chrome with an existing user profile.

        Args:
            driver_path: path to the chromedriver executable.
            option_path: Chrome user-data directory, passed through
                ``--user-data-dir``.
            ie_search_url: IEEE Xplore search URL prefix; the quoted title is
                appended to it.
            gg_search_url: Google Scholar search URL prefix; the quoted title
                is appended to it.
        """
        self.ie_search_url = ie_search_url
        self.gg_search_url = gg_search_url
        # Launch the browser with the user's profile so that its extensions
        # and settings are available.
        option = webdriver.ChromeOptions()
        option.add_argument("--user-data-dir=" + option_path)
        # NOTE(review): recent Selenium 4 releases no longer accept
        # `executable_path`; confirm the installed version or switch to a
        # `Service` object.
        self.browser = webdriver.Chrome(executable_path=driver_path, options=option)
        # Do not change casually: a window that is too narrow hides buttons
        # and the simulated clicks then fail.
        self.browser.set_window_size(800, 800)

    @staticmethod
    def _normalize_title(title):
        """Return *title* lower-cased with non-alphanumeric characters removed."""
        return ''.join(filter(str.isalnum, title)).lower()

    def _poll(self, action, default=None):
        """Call ``action()`` until it stops raising; return its result.

        At most ``poll_attempts`` tries are made, sleeping ``poll_interval``
        seconds after each failure. Returns ``default`` when every try failed.
        """
        for _ in range(self.poll_attempts):
            try:
                return action()
            except Exception:
                # Element not present/clickable yet. `Exception` (not a bare
                # except) keeps KeyboardInterrupt/SystemExit working.
                sleep(self.poll_interval)
        return default

    def get_bib_from_IEEE(self, paper_title):
        """Return the BibTeX text IEEE Xplore shows for *paper_title*.

        The search results are scanned for an entry whose title equals
        *paper_title* after normalisation (alphanumerics only, lower case).
        Returns ``''`` when no entry matches or the BibTeX pane never shows up.
        """
        self.browser.get(self.ie_search_url + parse.quote(paper_title))
        wanted = self._normalize_title(paper_title)

        def load_results():
            items = self.browser.find_elements('css selector', "[class='List-results-items']")
            # Indexing raises while the list is still empty, which triggers a retry.
            items[0].get_attribute('id')
            return items

        def open_cite_dialog():
            button = self.browser.find_element('css selector', "[class='layout-btn-white cite-this-btn']")
            button.click()
            return True

        def open_bibtex_tab():
            dialog = self.browser.find_element('css selector', "[class='modal-dialog']")
            # The second tab link of the dialog is the BibTeX one.
            dialog.find_elements('css selector', "[class='document-tab-link']")[1].click()
            return True

        def find_bib_pane():
            return self.browser.find_element('css selector', "[class='text ris-text']")

        # Wait for the result entries, then look for the requested paper.
        paper_id = None
        for item in self._poll(load_results, default=[]):
            if self._normalize_title(item.text.split('\n')[0]) == wanted:
                paper_id = item.get_attribute('id')
                break
        if not paper_id:  # not found
            return ''

        # Open the paper page, then the cite dialog, then its BibTeX tab.
        self.browser.get(self._IEEE_DOCUMENT_URL + paper_id)
        self._poll(open_cite_dialog)
        self._poll(open_bibtex_tab)
        if self._poll(find_bib_pane) is None:
            return ''
        # Give the pane time to be filled in, then re-query it and read the text.
        sleep(self.render_delay)
        return self._poll(lambda: find_bib_pane().text, default='')

    def get_bib_from_google_scholar(self, paper_title):
        """Return the citation text Google Scholar offers for the first hit.

        Clicks the cite link of the first search result, follows the first
        export link of the popup (presumably BibTeX - verify against the
        page) and returns the text of the resulting ``<pre>`` element.
        Returns ``''`` when that element never appears.
        """
        self.browser.get(self.gg_search_url + parse.quote(paper_title))

        def open_cite_popup():
            result = self.browser.find_element('css selector', '.gs_r.gs_or.gs_scl')
            footer = result.find_element('css selector', '.gs_fl')
            footer.find_element('css selector', '.gs_or_cit.gs_nph').click()
            return True

        def open_export_link():
            links = self.browser.find_element('id', 'gs_citi')
            links.find_element('css selector', '.gs_citi').click()
            return True

        def read_bib():
            return self.browser.find_element('tag name', 'pre').text

        # Each step is attempted even if the previous one timed out, as before.
        self._poll(open_cite_popup)
        self._poll(open_export_link)
        return self._poll(read_bib, default='')

    def get_bib(self, paper_title):
        """Return ``(source_name, bib_text)`` for *paper_title*.

        Only Google Scholar is queried; the IEEE lookup is intentionally not
        used here.
        """
        return "Google", self.get_bib_from_google_scholar(paper_title)


# Location of the browser driver (chromedriver).
driver_path = r'C:/Users/admin/Desktop/chromedriver'

# Chrome user-data directory. It lets the browser use your own settings;
# otherwise Selenium starts a browser with default settings and some
# extensions cannot be used.
option_path = r"C:/Users/Administrator/AppData/Local/Google/Chrome/User Data/"

# Before running, open the IEEE search page and copy a similar URL here;
# whatever follows the final '=' is the text that will be searched for.
ie_search_url = r'https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText='

# Same idea for Google Scholar.
gg_search_url = r'https://scholar.google.com/scholar?hl=zh-CN&as_sdt=0%2C5&inst=1597255436240989024&q='

# Create the scraper; this launches the browser.
get_bibs = GetBibs(
    driver_path=driver_path,
    option_path=option_path,
    ie_search_url=ie_search_url,
    gg_search_url=gg_search_url,
)
# %% ********************** scraper object defined above, scraping starts below *******************************
# Papers to fetch: each key is a label of your choice, each value is the
# paper title to search for. The entries below are examples.
paper_titles = {
    "ESPCN": "Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network",
    "Sparse_Coding": "Image Super-Resolution Via Sparse Representation",
    "ESRGAN": "ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks",
    "EnhanceNet": "EnhanceNet: Single Image Super-Resolution Through Automated Texture Synthesis",
    "Meta-SR": "Meta-SR: A Magnification-Arbitrary Network for Super-Resolution",
    "SAN": "Second-Order Attention Network for Single Image Super-Resolution",
}

# Look up every paper and print "<source>: <label>", the entry, and a blank line.
for label, title in paper_titles.items():
    source, bib = get_bibs.get_bib(title)
    print(f"{source}:", label)
    print(bib, end="\n\n")