# Paste metadata: Untitled | unknown | plain_text | 2 years ago | 899 B | 9 | Indexable
import re
import tokenize
def tokenize_paragraph(input_text):
    """Split *input_text* into tokens, keeping special entities whole.

    E-mail addresses, phone numbers written as ``+`` followed by 9-15
    digits, and rupee amounts written as ``₹`` followed by digits are each
    returned as a single token. All remaining text is split into runs of
    word characters and individual punctuation characters; whitespace is
    discarded.

    Args:
        input_text: The text to tokenize.

    Returns:
        A list of token strings in the order they occur in ``input_text``.
    """
    # The top-level-domain class is [A-Za-z]: inside a character class "|"
    # is a literal pipe, not alternation, so including it would let an
    # address swallow a following "|" and the text after it.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    phone_pattern = r'\+\d{9,15}'
    currency_pattern = r'₹\d+'

    # Alternation is tried left to right, so the specific entity patterns
    # must come before the generic word / punctuation fallbacks.
    combined_pattern = '|'.join(
        [email_pattern, phone_pattern, currency_pattern, r'\w+', r'[^\w\s]']
    )
    return re.findall(combined_pattern, input_text)
def extract_email_and_mobile(text):
    """Return the e-mail addresses and mobile numbers found in *text*.

    Args:
        text: The text to search.

    Returns:
        A ``(emails, mobiles)`` tuple of two lists of strings, each in order
        of appearance. Mobile numbers are matched as ``+`` followed by 9-15
        digits.
    """
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    mobile_pattern = r'\+\d{9,15}'
    return re.findall(email_pattern, text), re.findall(mobile_pattern, text)


# Demo driver: runs only when the file is executed as a script, so importing
# the module does not print anything.
if __name__ == "__main__":
    inputText = "Hello This is Shubh Pujara and My Email id is shubh.pujara@somaiya.edu, My Phone number is +911234567890, My Bank Balance is ₹30000"
    outputText = tokenize_paragraph(inputText)
    extracted_emails, extracted_mobiles = extract_email_and_mobile(inputText)
    print("Extracted emails:", extracted_emails)
    print("Extracted mobile numbers:", extracted_mobiles)
    print("Total Tokenized Text", outputText)