nord vpnnord vpn
Ad

Untitled

mail@pastecode.io avatar
unknown
plain_text
7 months ago
5.6 kB
3
Indexable
Never
import argparse
import os
from logging import DEBUG, INFO
from urllib.request import urlopen

import mtranslate as gtrans
import pandas as pd
import pubchempy as pcp
import py2opsin as opsin
import pymysql
import requests
from loguru import logger

# Command-line handling. No options are defined; parse_args() still provides
# -h/--help and rejects unexpected arguments.
parser = argparse.ArgumentParser()
args = parser.parse_args()

# Minimum level written to the rotating log file added below.
debug_level = "DEBUG"
logger.add("file_{time}.log", level=debug_level, rotation="100 MB")

# Database connection. Each setting can be overridden through an environment
# variable; the defaults are the values this script has always used, so a run
# without any of the variables set behaves as before.
# NOTE(review): the default password is still stored in source control only
# for backward compatibility -- set CHEMINFO_DB_PASSWORD, rotate the
# credential and drop the literal once deployments have migrated.
try:
    connection = pymysql.connect(
        user=os.environ.get('CHEMINFO_DB_USER', 'cheminfo'),
        password=os.environ.get('CHEMINFO_DB_PASSWORD', 'dMA4HLPv55PjoE'),
        host=os.environ.get('CHEMINFO_DB_HOST', 'localhost'),
        database=os.environ.get('CHEMINFO_DB_NAME', 'cheminfo'),
    )
    mycursor = connection.cursor()    ## connect to database
except pymysql.err.OperationalError as e:
    logger.exception(e)
    # Chain the original error so the cause stays visible in tracebacks.
    raise SystemExit("Unable to connect to database") from e

# URL template for the 800x800 PNG rendering of a compound; {} is the CID.
molimage = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/PNG?image_size=800x800"

# Counters reported at the end of the run.
lines = 0          # input names processed
pubchem_pass = 0   # names for which a PubChem record was found

def translate_name(translation):
    """Translate a Swedish compound name to English and normalise the result.

    The name is sent to the translation service (Swedish -> English). The
    returned text then has every space removed and each occurrence of
    "ethylamine" replaced by "ethylamino". Both the input and the final
    result are logged at DEBUG level.

    Args:
        translation: The Swedish name to translate.

    Returns:
        The translated, normalised English name.
    """
    logger.debug(f"Swedish: {translation}")
    english = gtrans.translate(translation, "en", "sv")
    # Applied in this order: strip spaces first, then rewrite the suffix.
    for old, new in ((" ", ""), ("ethylamine", "ethylamino")):
        english = english.replace(old, new)
    logger.debug(f"English: {english}")
    return english

# Batch lookup.
#
# Reads Swedish compound names from list.txt (one per line), translates each
# one, and looks it up in PubChem -- first by the translated name and, when
# no compound is found under that name, by the SMILES string OPSIN produces
# for the name. The first matching record is logged and inserted into the
# `cheminfo` table together with its PNG rendering. Names that cannot be
# resolved are appended to failed.txt and list2.txt.

# Properties requested from PubChem for every matched compound.
pubchem_properties = 'Title,MolecularFormula,CanonicalSMILES,IsomericSMILES,InChI,InChIKey,IUPACName,ExactMass'

# Columns that are logged and stored after the title; the order matches the
# column order of the INSERT statement below.
record_columns = (
    'IUPACName',
    'CanonicalSMILES',
    'IsomericSMILES',
    'MolecularFormula',
    'ExactMass',
    'InChI',
    'InChIKey',
)

insert_sql = """ INSERT INTO cheminfo (p_cid, title, iupac_name, canonical_smiles, isomeric_smiles, molecularformula, exactmass, inchi, inchikey, swedish_name, mol_image) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

try:
    # The output files are opened before list.txt (as before) and all three
    # handles are closed when the block exits, even on error.
    # NOTE(review): no encoding is given, so the locale default is used --
    # confirm that list.txt is stored in that encoding.
    with open('failed.txt', 'a') as failed_file, \
            open('list2.txt', 'a') as list2, \
            open('list.txt') as file:
        for swedish_name in file:
            swedish_name = swedish_name.rstrip("\n")
            if not swedish_name.strip():
                # Blank lines carry no name; do not send them to the
                # translation / PubChem services and do not count them.
                continue
            lines += 1

            english_name = translate_name(swedish_name)
            opsin_smiles = opsin.py2opsin(english_name)
            # str() keeps the log call from raising if the result is not a
            # string (py2opsin presumably returns False on an unexpected
            # error -- TODO confirm for the installed version).
            logger.debug("Smiles: " + str(opsin_smiles))
            if not isinstance(opsin_smiles, str):
                opsin_smiles = ""

            pubchem = pd.DataFrame()
            pubchem_check = False
            if pcp.get_compounds(english_name, 'name'):
                pubchem = pcp.get_properties(pubchem_properties, english_name, 'name', as_dataframe=True)
                if not pubchem.empty:
                    logger.debug("--- FOUND WITH NAME ---")
                    pubchem_check = True
            elif opsin_smiles and pcp.get_compounds(opsin_smiles, 'smiles'):
                # SMILES is only tried when the name matched no compound.
                pubchem = pcp.get_properties(pubchem_properties, opsin_smiles, 'smiles', as_dataframe=True)
                if not pubchem.empty:
                    logger.debug("--- FOUND WITH SMILES ---")
                    pubchem_check = True

            if not pubchem_check:
                logger.info("--- NO INFO FOUND ---")
                failed_file.write("pubchem failed swedish_name: " + swedish_name + "\n")
                failed_file.write("pubchem failed english_name: " + english_name + "\n")
                list2.write(english_name + "\n")
                continue

            # Only the first returned record is used.
            pubchem = pubchem.head(1)
            pubchem_pass += 1

            # The DataFrame index of the record is used as the PubChem CID.
            cid = pubchem.first_valid_index()
            print("PubChem CID: " + str(cid))

            # A timeout keeps one stalled download from hanging the batch.
            moldata = requests.get(molimage.format(str(cid)), timeout=60)
            if not moldata.ok:
                logger.warning("Image download for CID " + str(cid) + " returned HTTP " + str(moldata.status_code))
            molimg = moldata.content

            # The Title column may be absent; store the string "None" then.
            title = pubchem.Title.item() if 'Title' in pubchem else "None"
            values = [title] + [pubchem[column].item() for column in record_columns]

            # str() because some properties (e.g. ExactMass) may be numeric.
            for label, value in zip(('Title',) + record_columns, values):
                logger.info("PubChem " + label + ": " + str(value))

            try:
                mycursor.execute(insert_sql, (int(cid), *values, swedish_name, molimg))
                connection.commit()  # save to database
            except pymysql.err.IntegrityError:
                # Treated as "row already present".
                logger.debug("--- ALREADY IN DB ---")
            logger.info("--- INFO END ---")
finally:
    # Release the database connection even when the loop aborts.
    connection.close()

logger.debug("pubchem found: " + str(pubchem_pass) + " / " + str(lines))

nord vpnnord vpn
Ad