Untitled
unknown
plain_text
7 months ago
5.6 kB
3
Indexable
Never
# Translate Swedish chemical names to English, resolve them against PubChem
# (first by name, then by an OPSIN-derived SMILES string) and store the
# resulting properties plus a structure image in the `cheminfo` MySQL table.
#
# Input : list.txt    - one Swedish name per line
# Output: failed.txt  - names for which nothing was found on PubChem
#         list2.txt   - English translations (see NOTE(review) in the loop)
from logging import DEBUG, INFO
import argparse
import os
from urllib.request import urlopen

import mtranslate as gtrans
import pandas as pd
import pubchempy as pcp
import py2opsin as opsin
import pymysql
import requests
from loguru import logger

# No options are defined; parsing still provides --help and rejects stray
# command-line arguments.
parser = argparse.ArgumentParser()
args = parser.parse_args()

debug_level = "DEBUG"
logger.add("file_{time}.log", level=debug_level, rotation="100 MB")

# SECURITY NOTE: the database password was hard-coded in the script. It can
# now be overridden through the CHEMINFO_DB_PASSWORD environment variable;
# the literal fallback is kept only so existing setups keep working and
# should be removed (and the password rotated) once the variable is set.
try:
    connection = pymysql.connect(
        user='cheminfo',
        password=os.environ.get('CHEMINFO_DB_PASSWORD', 'dMA4HLPv55PjoE'),
        host='localhost',
        database='cheminfo',
    )
    mycursor = connection.cursor()
except pymysql.err.OperationalError as e:
    logger.exception(e)
    raise SystemExit("Unable to connect to database")

# URL template for the structure image; `{}` is filled with the PubChem CID.
molimage = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/PNG?image_size=800x800"

# Properties requested from PubChem for every compound.
PUBCHEM_PROPERTIES = (
    'Title,MolecularFormula,CanonicalSMILES,IsomericSMILES,'
    'InChI,InChIKey,IUPACName,ExactMass'
)

INSERT_SQL = """
    INSERT INTO cheminfo
        (p_cid, title, iupac_name, canonical_smiles, isomeric_smiles,
         molecularformula, exactmass, inchi, inchikey, swedish_name, mol_image)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

# Seconds to wait for the image download before giving up instead of hanging.
IMAGE_TIMEOUT = 60

lines = 0          # number of names processed
pubchem_pass = 0   # number of names resolved on PubChem


def translate_name(translation):
    """Translate a Swedish chemical name to English.

    The translated text has all spaces removed and "ethylamine" rewritten
    to "ethylamino" before it is returned. Both the input and the result
    are logged at debug level.
    """
    logger.debug("Swedish: " + translation)
    translated = gtrans.translate(translation, "en", "sv")
    translated = translated.replace(" ", "").replace("ethylamine", "ethylamino")
    logger.debug("English: " + translated)
    return translated


def _lookup_pubchem(english_name, opsin_smiles):
    """Return a one-row PubChem property DataFrame, or an empty DataFrame.

    The English name is tried first. Only when PubChem knows no compound
    by that name is the OPSIN SMILES string (if any) tried instead.
    """
    # get_compounds serves as an existence check before properties are
    # requested for the same identifier.
    if pcp.get_compounds(english_name, 'name'):
        identifier, namespace = english_name, 'name'
        found_message = "--- FOUND WITH NAME ---"
    elif opsin_smiles and pcp.get_compounds(opsin_smiles, 'smiles'):
        identifier, namespace = opsin_smiles, 'smiles'
        found_message = "--- FOUND WITH SMILES ---"
    else:
        return pd.DataFrame()

    properties = pcp.get_properties(
        PUBCHEM_PROPERTIES, identifier, namespace, as_dataframe=True
    )
    if properties.empty:
        return properties
    logger.debug(found_message)
    return properties.head(1)


try:
    # Output files are opened first (as before) and are now closed reliably.
    with open('failed.txt', 'a') as failed_file, \
            open('list2.txt', 'a') as list2, \
            open('list.txt') as file:
        for raw_line in file:
            # strip() also removes a trailing "\r" from CRLF files; blank
            # lines are skipped rather than translated and sent to PubChem.
            swedish_name = raw_line.strip()
            if not swedish_name:
                continue
            lines += 1

            english_name = translate_name(swedish_name)

            # py2opsin may return False instead of a string; normalise to ""
            # so the log concatenation and the SMILES check stay valid.
            opsin_smiles = opsin.py2opsin(english_name) or ""
            logger.debug("Smiles: " + opsin_smiles)

            pubchem = _lookup_pubchem(english_name, opsin_smiles)

            if pubchem.empty:
                logger.info("--- NO INFO FOUND ---")
                failed_file.write("pubchem failed swedish_name: " + swedish_name + "\n")
                failed_file.write("pubchem failed english_name: " + english_name + "\n")
                # NOTE(review): the original indentation was lost; this write
                # is assumed to belong to the not-found branch - confirm.
                list2.write(english_name + "\n")
                continue

            pubchem_pass += 1

            # The DataFrame index holds the PubChem CID.
            cid = pubchem.first_valid_index()
            print("PubChem CID: " + str(cid))

            moldata = requests.get(molimage.format(str(cid)), timeout=IMAGE_TIMEOUT)
            if not moldata.ok:
                logger.warning(
                    "Image download for CID " + str(cid)
                    + " returned HTTP " + str(moldata.status_code)
                )
            molimg = moldata.content

            # PubChem does not always return a Title column.
            title = pubchem.Title.item() if 'Title' in pubchem else "None"
            iupac_name = pubchem.IUPACName.item()
            canonical_smiles = pubchem.CanonicalSMILES.item()
            isomeric_smiles = pubchem.IsomericSMILES.item()
            molecular_formula = pubchem.MolecularFormula.item()
            exact_mass = pubchem.ExactMass.item()
            inchi = pubchem.InChI.item()
            inchikey = pubchem.InChIKey.item()

            logger.info("PubChem Title: " + str(title))
            logger.info("PubChem IUPACName: " + iupac_name)
            logger.info("PubChem CanonicalSMILES: " + canonical_smiles)
            logger.info("PubChem IsomericSMILES: " + isomeric_smiles)
            logger.info("PubChem MolecularFormula: " + molecular_formula)
            # str() guards against a numeric ExactMass breaking the concat.
            logger.info("PubChem ExactMass: " + str(exact_mass))
            logger.info("PubChem InChI: " + inchi)
            logger.info("PubChem InChIKey: " + inchikey)

            try:
                mycursor.execute(INSERT_SQL, (
                    int(cid), title, iupac_name, canonical_smiles,
                    isomeric_smiles, molecular_formula, exact_mass,
                    inchi, inchikey, swedish_name, molimg,
                ))
                connection.commit()  # save to database
            except pymysql.err.IntegrityError:
                logger.debug("--- ALREADY IN DB ---")
            # NOTE(review): assumed to follow the try/except (logged for every
            # found compound) - the original indentation was lost; confirm.
            logger.info("--- INFO END ---")
finally:
    # Close the connection even when processing aborts part-way through.
    connection.close()

logger.debug("pubchem found: " + str(pubchem_pass) + " / " + str(lines))