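# Paste-site page header captured together with the code (not part of the
# program); kept as comments so the file parses as Python:
#
# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# 2 years ago
# 1.3 kB
# 32
# Indexable
# Never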
import os
import sys
import tempfile
from time import sleep

from bs4 import BeautifulSoup
from gtts import gTTS
from nltk.tokenize import sent_tokenize
from playsound import playsound
from requests import get

def get_DOM_by_url(url, timeout=30):
    """Yield the headline and the plain paragraphs of the page at *url*.

    The page is downloaded with ``requests.get`` and parsed by BeautifulSoup
    with the ``lxml`` parser.  The text of the first ``<h1>`` is yielded
    first, followed by the text of every ``<p>`` inside the first
    ``<div class="post_content">`` that carries no ``class`` attribute of its
    own and whose text is non-empty.

    Args:
        url: Address of the page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so an unresponsive server cannot block forever.

    Yields:
        str: The headline text (when the page has an ``<h1>``), then one
        string per matching paragraph, in document order.

    Raises:
        Any exception raised by ``requests.get`` (including its timeout
        error) propagates to the caller once iteration starts.
    """
    soup = BeautifulSoup(get(url, timeout=timeout).content, 'lxml')

    # find() returns None when the tag is absent; skip the headline instead
    # of failing on ``None.text``.
    headline = soup.find('h1')
    if headline is not None:
        yield headline.text

    # Without the content container there are no paragraphs to read.
    container = soup.find('div', {'class': 'post_content'})
    if container is None:
        return

    for paragraph in container.find_all('p'):
        # Only paragraphs without a class attribute are yielded.
        if paragraph.has_attr('class'):
            continue
        if paragraph.text:
            yield paragraph.text

def extract_sentencies(parsed):
    """Lazily split every text chunk of *parsed* into sentences.

    Each item taken from *parsed* is handed to ``sent_tokenize`` and the
    resulting sentences are yielded one by one, in order.
    """
    for paragraph in parsed:
        yield from sent_tokenize(paragraph)
            
def to_speech(text_generator, lang='ru'):
    """Synthesize and play every phrase of *text_generator*, one at a time.

    Each phrase is turned into speech with ``gTTS`` (``slow=False``), saved
    as an MP3 file and played with ``playsound``.  The running index and the
    phrase are printed to stdout before the audio is saved.  After the first
    played phrase the function pauses for 1 second, after every later one
    for 0.1 second.

    The MP3 is written into a private temporary directory that is removed
    when the function finishes, rather than to a fixed path under ``/tmp``;
    this works on systems without ``/tmp`` and keeps concurrent runs from
    overwriting each other's audio.

    Args:
        text_generator: Iterable of phrases to speak.
        lang: Language code passed to ``gTTS``; defaults to ``'ru'``.

    Returns:
        None.
    """
    spoken = 0
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, 'out.mp3')
        for phrase in text_generator:
            try:
                speech = gTTS(phrase, lang=lang, slow=False)
                print(spoken, phrase)
                speech.save(audio_path)
            except AssertionError as err:
                # gTTS reports text it cannot speak (e.g. empty input) with
                # an AssertionError; skip the phrase but say so on stderr
                # instead of dropping it silently.
                print(f'skipped {phrase!r}: {err}', file=sys.stderr)
            else:
                playsound(audio_path)
                # Longer pause after the very first phrase only.
                sleep(0.1 if spoken else 1)
                spoken += 1
        
        
if __name__ == '__main__':
    import sys

    # Check for the URL argument up front.  Wrapping the whole pipeline in
    # ``except IndexError`` would misreport any IndexError raised while
    # fetching, tokenizing or playing as a usage mistake.
    if len(sys.argv) < 2:
        print(f'use {sys.argv[0]} http://google.com/', file=sys.stderr)
        # Non-zero status so shells and scripts can detect the usage error.
        sys.exit(2)

    url = sys.argv[1]
    print(f'Playing {url}...')
    # Lazy pipeline: page text -> sentences -> speech.
    to_speech(
        extract_sentencies(
            get_DOM_by_url(url)
        )
    )