Untitled
unknown
plain_text
a year ago
8.6 kB
3
Indexable
Never
"""Closed-caption transcript tooling (proof of concept).

This module merges two scripts that shared most of their code:

* ``main()`` -- download one transcript JSON per CSV row, then print the
  mean word confidence of every downloaded transcript.
* ``transcripts_to_sentences_jsons()`` -- the script published as
  ``src/CCTranslationsPOC/transcriptsToSentencesJsons.py``: analyse an
  already-downloaded folder and write the per-file summaries, best
  accuracy first, to ``outputList.json``.

Run ``python <this file>`` for the first and ``python <this file> sentences``
for the second.

The transcript layout read here is ``data['results']['items']``, where each
item has a ``type``, an ``alternatives`` list with ``content`` and
``confidence``, and ``start_time``/``end_time`` on spoken items.
NOTE(review): this looks like Amazon Transcribe output -- confirm.
"""

import csv
import json
import os
import sys

try:
    # Only needed once the (currently disabled) Translate call is re-enabled.
    import boto3  # noqa: F401
except ImportError:
    boto3 = None

# --- Settings used by main() -------------------------------------------------
csvFileLocation = "./c85f738c-9b4e-4ba7-8ffd-0bab0091d5f6.csv"
outputTranscriptsLocation = 'transcriptsCoverage3/'
transcriptURL = 'https://m.media-amazon.com/images/S/vse-vms-closed-captions-artifact-us-east-1-prod/transcripts/'

# path_to_json = 'transcriptsTest/'
path_to_json = outputTranscriptsLocation

# Punctuation marks that close a sentence.
SENTENCE_TERMINATORS = ('.', '?')


def request_id_from_row(row):
    """Return the request id stored in column 6 of a CSV row.

    Column 6 holds a '/'-separated path; the id is the last path component
    up to its first '.'.
    """
    return row[6].split('/')[-1].split('.')[0]


def _data_rows(csv_path):
    """Yield the CSV rows after the header, skipping rows without column 6."""
    # newline='' is what the csv module asks for when it is given a file.
    with open(csv_path, 'r', newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # header row
        for row in reader:
            # Blank or truncated lines used to raise IndexError on row[6].
            if len(row) > 6:
                yield row


def download_transcripts(csv_path, output_dir, base_url, timeout=30):
    """Fetch ``<base_url><request id>.json`` for every CSV row.

    Each transcript is saved as ``<output_dir><request id>.json``. Rows whose
    download fails, returns an HTTP error status, or is not valid JSON are
    reported on stdout and skipped, so one bad row does not stop the run.

    Returns:
        The number of transcripts written.
    """
    # Imported here so the analysis helpers stay usable without `requests`.
    import requests

    os.makedirs(output_dir, exist_ok=True)
    saved = 0
    for row in _data_rows(csv_path):
        request_id = request_id_from_row(row)
        print(request_id)
        url = base_url + request_id + '.json'
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            payload = response.json()
        except (requests.exceptions.RequestException, ValueError) as exc:
            # ValueError: older `requests` versions raise it for a non-JSON body.
            print('skipping ' + request_id + ': ' + str(exc))
            continue
        # raise SystemExit(e)
        with open(os.path.join(output_dir, request_id + '.json'), 'w') as out:
            json.dump(payload, out)
        saved += 1
    return saved


def load_content_ids(csv_path):
    """Map request id (from column 6) to content id (column 0) for every row."""
    check = {}
    for row in _data_rows(csv_path):
        request_id = request_id_from_row(row)
        content_id = row[0]
        print(request_id)
        print(content_id)
        check[request_id] = content_id
    return check


def mean_confidence(items):
    """Return the mean confidence of the non-punctuation items.

    Returns 0.0 when there are no such items (this used to divide by zero).
    """
    scores = [
        float(item['alternatives'][0]['confidence'])
        for item in items
        if item['type'] != 'punctuation'
    ]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)


def split_sentences(items):
    """Group transcript items into sentences ending in '.' or '?'.

    Every spoken word is appended with a leading space and punctuation is
    appended directly, so each sentence text starts with a space (kept for
    compatibility with existing output).

    Returns:
        A list of dicts with ``startTime`` (start of the first spoken word),
        ``endTime`` (end of the last spoken word), ``text`` and an empty
        ``TranslatedText`` placeholder. Words after the last terminator are
        not emitted.
    """
    sentences = []
    start = None
    last_word_end = -1
    text = ""
    for item in items:
        word = item['alternatives'][0]['content']
        if item['type'] != 'punctuation':
            # Take timing from spoken words only: reading start_time from a
            # leading punctuation item raised KeyError when it had none.
            if start is None:
                start = item['start_time']
            last_word_end = item['end_time']
            text += ' ' + word
            continue

        text += word
        if word not in SENTENCE_TERMINATORS:
            continue
        if start is not None:
            # Translation hook (disabled): a boto3 'translate' client in
            # region us-west-2 called translate_text(Text=<sentence>,
            # SourceLanguageCode="en", TargetLanguageCode="fr-CA") and its
            # 'TranslatedText' result was stored below.
            sentences.append({
                'startTime': start,
                'endTime': last_word_end,
                'text': text,
                'TranslatedText': "",  # result.get('TranslatedText')
            })
        # A terminator with no spoken word before it carries no timing and
        # is dropped.
        start = None
        text = ""
    return sentences


def summarise_transcripts(transcripts_dir, content_ids):
    """Summarise every ``*.json`` transcript in *transcripts_dir*.

    Args:
        transcripts_dir: Folder holding ``<request id>.json`` files.
        content_ids: Mapping of request id to content id.

    Returns:
        One dict per file, in file-name order, with ``name``, ``accuracy``,
        ``contentId`` and ``sentences``.

    Raises:
        KeyError: A transcript's request id is missing from *content_ids*.
    """
    summaries = []
    for file_name in sorted(os.listdir(transcripts_dir)):
        if not file_name.endswith('.json'):
            continue
        with open(os.path.join(transcripts_dir, file_name), encoding='utf-8') as json_file:
            data = json.load(json_file)
        print(file_name)
        items = data['results']['items']
        accuracy = mean_confidence(items)
        print(accuracy)
        summaries.append({
            'name': file_name,
            'accuracy': accuracy,
            'contentId': content_ids[file_name.split('.')[0]],
            'sentences': split_sentences(items),
        })
    return summaries


def coverage_message(file_count):
    """Return the final report line of main().

    NOTE(review): the wording mentions an 85% threshold, but the number
    passed in by main() is the count of all analysed files -- confirm which
    one is intended.
    """
    # str(): concatenating the int directly raised TypeError.
    return "No. of jsons with accuracy with 'accuracy' > '85%' out of 1000 " + str(file_count)


def write_summaries(summaries, output_path):
    """Write *summaries* to *output_path* as a JSON array.

    The list is serialised once. It used to be passed through json.dumps and
    then json.dump, which stored a single JSON string instead of a list.
    """
    with open(output_path, 'w') as out:
        json.dump(summaries, out)


def main():
    """Download the transcripts listed in the CSV and report their accuracy."""
    download_transcripts(csvFileLocation, outputTranscriptsLocation, transcriptURL)
    check = load_content_ids(csvFileLocation)
    allDict = summarise_transcripts(path_to_json, check)
    accuracies = sorted((entry['accuracy'] for entry in allDict), reverse=True)
    print(accuracies)
    print(coverage_message(len(allDict)))
    # Sorting allDict and writing outputList.json is deliberately left to
    # transcripts_to_sentences_jsons().


def transcripts_to_sentences_jsons(
        csv_path="./3b1f7bc4-49b4-4855-9cf8-a81c093573fb.csv",
        transcripts_dir='transcripts2/',
        output_path='outputList.json'):
    """Analyse a downloaded transcript folder and write the sorted summaries.

    Returns:
        The summaries, highest accuracy first, as written to *output_path*.
    """
    check = load_content_ids(csv_path)
    allDict = summarise_transcripts(transcripts_dir, check)
    print(sorted((entry['accuracy'] for entry in allDict), reverse=True))
    allDict = sorted(allDict, key=lambda x: x['accuracy'], reverse=True)
    print(allDict)
    write_summaries(allDict, output_path)
    return allDict


if __name__ == "__main__":
    if sys.argv[1:] == ["sentences"]:
        transcripts_to_sentences_jsons()
    else:
        main()