Untitled
unknown
plain_text
2 years ago
2.1 kB
4
Indexable
"""Extract e-mail addresses and mobile numbers from text, and tokenize paragraphs.

The tokenizer keeps e-mail addresses, ``+``-prefixed phone numbers and rupee
amounts together as single tokens.
"""

import re
import tokenize  # NOTE(review): unused in this script; kept pending confirmation it can be dropped

# E-mail pattern shared by the extractor and the tokenizer. The TLD class is
# [A-Za-z]: writing it as [A-Z|a-z] would also accept a literal "|".
_EMAIL_PATTERN = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
_EMAIL_RE = re.compile(_EMAIL_PATTERN)

# 9-15 digits with an optional leading "+". The lookarounds reject partial
# hits: the run must not be glued to a preceding word character or "+", and
# must not be followed by another word character or by "@" (which would make
# it the local part of an e-mail address).
_MOBILE_RE = re.compile(r"(?<![\w+])\+?\d{9,15}(?![\w@])")

# Alternation order matters: the specific shapes (e-mail, "+"-prefixed phone,
# rupee amount) are tried before the generic word / single-punctuation
# fallbacks. None of the alternatives contains a capture group, so
# ``findall`` returns plain strings.
_TOKEN_RE = re.compile(
    "|".join([_EMAIL_PATTERN, r"\+\d{9,15}", r"₹\d+", r"\w+", r"[^\w\s]"])
)


def extract_email_and_mobile(text):
    """Return the e-mail addresses and mobile numbers found in *text*.

    The whole text is scanned, so values wrapped in punctuation such as
    ``(a@b.com)`` or ``<+911234567890>`` are found as well.

    Args:
        text: The string to search.

    Returns:
        A ``(emails, mobiles)`` tuple of two lists of strings, each in order
        of appearance. A mobile number is any free-standing run of 9-15
        digits with an optional leading ``+``.
    """
    return _EMAIL_RE.findall(text), _MOBILE_RE.findall(text)


def tokenize_paragraph(input_text):
    """Split *input_text* into a list of tokens.

    E-mail addresses, ``+``-prefixed phone numbers (9-15 digits) and rupee
    amounts (``₹`` followed by digits) are each kept as one token. All other
    text is split into runs of word characters and single punctuation
    characters; whitespace is dropped.

    Args:
        input_text: The string to tokenize.

    Returns:
        The list of token strings in order of appearance.
    """
    return _TOKEN_RE.findall(input_text)


def main():
    """Run both functions on the sample sentences and print the results."""
    text_input = "Please contact me at john.doe@example.com or call 1234567890 for more information."
    extracted_emails, extracted_mobiles = extract_email_and_mobile(text_input)
    print("Extracted emails:", extracted_emails)
    print("Extracted mobile numbers:", extracted_mobiles)

    paragraph = "Hello This is Shubh Pujara and My Email id is shubh.pujara@somaiya.edu, My Phone number is +911234567890, My Bank Balance is ₹30000"
    tokens = tokenize_paragraph(paragraph)
    extracted_emails, extracted_mobiles = extract_email_and_mobile(paragraph)
    print("Extracted emails:", extracted_emails)
    print("Extracted mobile numbers:", extracted_mobiles)
    print("Total Tokenized Text", tokens)


if __name__ == "__main__":
    main()
Editor is loading...