# Untitled
#
# avatar
# unknown
# plain_text
# 2 years ago
# 2.1 kB
# 4
# Indexable
import re
from nltk.tokenize import regexp_tokenize

def extract_email_and_mobile(text):
    """Extract e-mail addresses and "+"-prefixed mobile numbers from text.

    The whole string is scanned with ``re.findall`` rather than anchoring a
    match at the start of each whitespace-separated token, so items wrapped
    in or preceded by punctuation (``"(john@example.com)"``,
    ``"tel:+911234567890"``) are still found.

    Args:
        text: The string to search.

    Returns:
        A ``(emails, mobiles)`` tuple of two lists of strings, each in order
        of appearance in ``text``. Either list may be empty.
    """
    # The TLD class is [A-Za-z]: inside a character class "|" is a literal
    # pipe, not alternation, so "[A-Z|a-z]" wrongly accepted "c|m" as a TLD.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    # A mobile number is "+" followed by 9-15 digits. The lookahead rejects
    # longer digit runs instead of returning a truncated 15-digit prefix.
    mobile_pattern = r'\+\d{9,15}(?!\d)'

    emails = re.findall(email_pattern, text)
    mobiles = re.findall(mobile_pattern, text)
    return emails, mobiles

if __name__ == "__main__":
    # Smoke test with a sample sentence. The sample phone number has no
    # leading "+", which the mobile pattern in extract_email_and_mobile
    # requires, so the mobile list prints empty for this input.
    text_input = (
        "Please contact me at john.doe@example.com "
        "or call 1234567890 for more information."
    )
    extracted_emails, extracted_mobiles = extract_email_and_mobile(text_input)
    for label, found in (
        ("Extracted emails:", extracted_emails),
        ("Extracted mobile numbers:", extracted_mobiles),
    ):
        print(label, found)









import re
import tokenize

def tokenize_paragraph(input_text):
    """Split text into tokens, keeping e-mails, phone numbers and rupee amounts whole.

    Args:
        input_text: The string to tokenize.

    Returns:
        A list of string tokens in order of appearance. Each token is one of:
        an e-mail address, a "+"-prefixed run of 9-15 digits, a "₹" followed
        by digits, a run of word characters, or a single punctuation
        character. Whitespace is dropped.
    """
    # The TLD class is [A-Za-z]: inside a character class "|" is a literal
    # pipe, so "[A-Z|a-z]" glued "a@b.com|c" into a single e-mail token.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    # The lookahead stops an over-long digit run from being cut in two
    # after its 15th digit.
    phone_pattern = r'\+\d{9,15}(?!\d)'
    currency_pattern = r'₹\d+'

    # Order matters: regex alternation takes the first alternative that
    # matches, so the specific patterns must precede the generic fallbacks.
    combined_pattern = '|'.join([
        email_pattern,
        phone_pattern,
        currency_pattern,
        r'\w+',
        r'[^\w\s]',
    ])

    return re.findall(combined_pattern, input_text)

# Demo run: tokenize a sample sentence and extract its e-mail address and
# mobile number, then print all three results.
inputText = (
    "Hello This is Shubh Pujara and My Email id is shubh.pujara@somaiya.edu, "
    "My Phone number is +911234567890, "
    "My Bank Balance is ₹30000"
)
extracted_emails, extracted_mobiles = extract_email_and_mobile(inputText)
outputText = tokenize_paragraph(inputText)
print("Extracted emails:", extracted_emails)
print("Extracted mobile numbers:", extracted_mobiles)
print("Total Tokenized Text", outputText)
# Editor is loading...