import os
import boto3
import requests
import json
import csv
# Input CSV export. Column 0 is used as the content id and column 6 as a path
# whose basename (up to the first dot) is used as the request id.
csvFileLocation = "./c85f738c-9b4e-4ba7-8ffd-0bab0091d5f6.csv"
# Directory the downloaded transcript JSON files are written to.
outputTranscriptsLocation = 'transcriptsCoverage3/'
# Base URL; "<request id>.json" is appended to fetch one transcript.
transcriptURL = 'https://m.media-amazon.com/images/S/vse-vms-closed-captions-artifact-us-east-1-prod/transcripts/'


def request_id_from_path(path):
    """Return the last '/'-separated component of *path*, up to its first dot."""
    return path.split('/')[-1].split('.')[0]


def download_transcripts(csv_path, base_url, output_dir, timeout=30):
    """Download one transcript JSON per data row of the CSV at *csv_path*.

    The first CSV row is treated as a header and skipped. For every other
    row the request id is derived from column 6, the document at
    ``base_url + request_id + '.json'`` is fetched, and the parsed JSON is
    written to ``output_dir/<request_id>.json``.

    Downloads are best-effort: a row whose request fails (network error,
    timeout, non-2xx status, or a body that is not JSON) is reported on
    stdout and skipped, so one bad transcript does not abort the run.

    Args:
        csv_path: Path of the CSV file to read.
        base_url: URL prefix the request id is appended to.
        output_dir: Directory to write transcripts into; created if missing.
        timeout: Per-request timeout in seconds.
    """
    # The original assumed the directory already existed and crashed otherwise.
    os.makedirs(output_dir, exist_ok=True)
    with open(csv_path, 'r', newline='') as file:
        csvreader = csv.reader(file)
        next(csvreader, None)  # skip the header row (no-op on an empty file)
        for row in csvreader:
            if len(row) <= 6:
                # Blank or truncated line: there is no path column to read.
                continue
            requestId = request_id_from_path(row[6])
            print(requestId)
            url = base_url + requestId + '.json'
            try:
                # Without a timeout a stalled connection hangs the whole run.
                response = requests.get(url, timeout=timeout)
                # Do not save an error page as if it were a transcript.
                response.raise_for_status()
                transcript = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException subclass.
                print("Skipping " + requestId + ": " + str(e))
                continue
            location = os.path.join(output_dir, requestId + '.json')
            with open(location, 'w') as f:
                json.dump(transcript, f)


if __name__ == "__main__":
    download_transcripts(csvFileLocation, transcriptURL, outputTranscriptsLocation)
# Module-level accumulators shared with the analysis stage further down.
l = []          # per-file mean confidence values
allDict = []    # per-file summary dicts
fileCount = 0   # number of transcript files processed
check = {}      # request id -> content id, filled from the CSV when run as a script


def load_content_ids(csv_path):
    """Build a ``{request_id: content_id}`` mapping from the CSV at *csv_path*.

    The first row is treated as a header and skipped. For every other row
    the content id is column 0 and the request id is the basename of
    column 6 up to its first dot. Each pair is echoed to stdout. Rows with
    fewer than seven columns (for example a trailing blank line) are skipped
    instead of raising ``IndexError``. If a request id occurs more than
    once, the last row wins.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        dict mapping request id to content id; empty for an empty file.
    """
    mapping = {}
    # newline='' is what the csv module documents for files handed to a reader.
    with open(csv_path, 'r', newline='') as file:
        csvreader = csv.reader(file)
        next(csvreader, None)  # skip the header row (no-op on an empty file)
        for row in csvreader:
            if len(row) <= 6:
                continue
            requestId = row[6].split('/')[-1].split('.')[0]
            contentId = row[0]
            print(requestId)
            print(contentId)
            mapping[requestId] = contentId
    return mapping


if __name__ == "__main__":
    # path_to_json = 'transcriptsTest/'
    path_to_json = outputTranscriptsLocation
    check = load_content_ids(csvFileLocation)
def mean_confidence(items):
    """Return the mean ``confidence`` of the non-punctuation *items*.

    Each item is expected to carry ``type`` and ``alternatives[0].confidence``.
    Returns 0.0 when there is no non-punctuation item, where the original
    code divided by zero.
    """
    total = 0
    count = 0
    for obj in items:
        if obj['type'] != 'punctuation':
            count += 1
            total += float(obj['alternatives'][0]['confidence'])
    if count == 0:
        return 0.0
    return total / count


def split_sentences(items):
    """Group transcript *items* into sentences ending at '.' or '?'.

    Each sentence is a dict with:
      * ``startTime``: ``start_time`` of the first non-punctuation item in
        the sentence, or -1 if the sentence has none;
      * ``endTime``: ``end_time`` of the most recent non-punctuation item
        seen so far, or -1 if there has been none;
      * ``text``: every word prefixed with a single space, punctuation
        appended without one;
      * ``TranslatedText``: always the empty string (translation is not
        performed here).

    Items after the last '.' or '?' are not emitted.
    """
    sentences = []
    currentStart = -1
    currentTime = -1
    currentSentence = ""
    for obj in items:
        word = obj['alternatives'][0]['content']
        if obj['type'] != 'punctuation':
            # Only non-punctuation items are asked for timestamps. The
            # original read start_time from whichever item opened a sentence
            # and raised KeyError when that item was punctuation without one
            # (presumably how the transcription service emits punctuation;
            # confirm against its output format).
            if currentStart == -1:
                currentStart = obj['start_time']
            currentSentence += ' ' + word
            currentTime = obj['end_time']
            continue
        currentSentence += word
        if word == '.' or word == '?':
            sentences.append({
                'startTime': currentStart,
                'endTime': currentTime,
                'text': currentSentence,
                # Placeholder: the machine-translation call is disabled.
                'TranslatedText': "",
            })
            currentStart = -1
            currentSentence = ""
    return sentences


def summarise_transcript(file_name, data, content_ids):
    """Build the summary dict for one parsed transcript.

    Args:
        file_name: Name of the JSON file; the part before its first dot is
            used as the lookup key into *content_ids*.
        data: Parsed transcript; ``data['results']['items']`` is read.
        content_ids: Mapping of request id to content id.

    Returns:
        dict with keys ``name``, ``accuracy``, ``contentId`` and
        ``sentences``. ``contentId`` is None when the file is not listed in
        *content_ids* (the original raised KeyError and aborted the run).
    """
    items = data['results']['items']
    return {
        'name': file_name,
        'accuracy': mean_confidence(items),
        'contentId': content_ids.get(file_name.split('.')[0]),
        'sentences': split_sentences(items),
    }


if __name__ == "__main__":
    for file_name in os.listdir(path_to_json):
        if not file_name.endswith('.json'):
            continue
        with open(path_to_json + file_name) as json_file:
            data = json.load(json_file)
        fileCount += 1
        print(file_name)
        fileData = summarise_transcript(file_name, data, check)
        print(fileData['accuracy'])
        l.append(fileData['accuracy'])
        allDict.append(fileData)
    l.sort(reverse=True)
    print(l)
    # fileCount is an int, so it must be converted before concatenation
    # (the original raised TypeError here).
    # NOTE(review): fileCount counts every processed file; no 85% threshold
    # is applied anywhere, so the wording of this message should be confirmed.
    print("No. of jsons with accuracy with 'accuracy' > '85%' out of 1000 " + str(fileCount))
    # Sorting allDict by accuracy and writing it to outputList.json is
    # currently disabled.
# src/CCTranslationsPOC/transcriptsToSentencesJsons.py
# Lines 1 to 128 (Context lines: 5, 20, 100)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import json
import boto3
import requests
import json
import csv
# Directory holding the transcript JSON files to analyse.
path_to_json = 'transcripts2/'
# Module-level accumulators shared with the analysis stage further down.
l = []          # per-file mean confidence values
allDict = []    # per-file summary dicts
check = {}      # request id -> content id, filled from the CSV when run as a script


def load_content_ids(csv_path):
    """Build a ``{request_id: content_id}`` mapping from the CSV at *csv_path*.

    The first row is treated as a header and skipped. For every other row
    the content id is column 0 and the request id is the basename of
    column 6 up to its first dot. Each pair is echoed to stdout. Rows with
    fewer than seven columns (for example a trailing blank line) are skipped
    instead of raising ``IndexError``. If a request id occurs more than
    once, the last row wins.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        dict mapping request id to content id; empty for an empty file.
    """
    mapping = {}
    # newline='' is what the csv module documents for files handed to a reader.
    with open(csv_path, 'r', newline='') as file:
        csvreader = csv.reader(file)
        next(csvreader, None)  # skip the header row (no-op on an empty file)
        for row in csvreader:
            if len(row) <= 6:
                continue
            requestId = row[6].split('/')[-1].split('.')[0]
            contentId = row[0]
            print(requestId)
            print(contentId)
            mapping[requestId] = contentId
    return mapping


if __name__ == "__main__":
    check = load_content_ids("./3b1f7bc4-49b4-4855-9cf8-a81c093573fb.csv")
def mean_confidence(items):
    """Return the mean ``confidence`` of the non-punctuation *items*.

    Each item is expected to carry ``type`` and ``alternatives[0].confidence``.
    Returns 0.0 when there is no non-punctuation item, where the original
    code divided by zero.
    """
    total = 0
    count = 0
    for obj in items:
        if obj['type'] != 'punctuation':
            count += 1
            total += float(obj['alternatives'][0]['confidence'])
    if count == 0:
        return 0.0
    return total / count


def split_sentences(items):
    """Group transcript *items* into sentences ending at '.' or '?'.

    Each sentence is a dict with:
      * ``startTime``: ``start_time`` of the first non-punctuation item in
        the sentence, or -1 if the sentence has none;
      * ``endTime``: ``end_time`` of the most recent non-punctuation item
        seen so far, or -1 if there has been none;
      * ``text``: every word prefixed with a single space, punctuation
        appended without one;
      * ``TranslatedText``: always the empty string (translation is not
        performed here).

    Items after the last '.' or '?' are not emitted.
    """
    sentences = []
    currentStart = -1
    currentTime = -1
    currentSentence = ""
    for obj in items:
        word = obj['alternatives'][0]['content']
        if obj['type'] != 'punctuation':
            # Only non-punctuation items are asked for timestamps. The
            # original read start_time from whichever item opened a sentence
            # and raised KeyError when that item was punctuation without one
            # (presumably how the transcription service emits punctuation;
            # confirm against its output format).
            if currentStart == -1:
                currentStart = obj['start_time']
            currentSentence += ' ' + word
            currentTime = obj['end_time']
            continue
        currentSentence += word
        if word == '.' or word == '?':
            sentences.append({
                'startTime': currentStart,
                'endTime': currentTime,
                'text': currentSentence,
                # Placeholder: the machine-translation call is disabled.
                'TranslatedText': "",
            })
            currentStart = -1
            currentSentence = ""
    return sentences


def summarise_transcript(file_name, data, content_ids):
    """Build the summary dict for one parsed transcript.

    Args:
        file_name: Name of the JSON file; the part before its first dot is
            used as the lookup key into *content_ids*.
        data: Parsed transcript; ``data['results']['items']`` is read.
        content_ids: Mapping of request id to content id.

    Returns:
        dict with keys ``name``, ``accuracy``, ``contentId`` and
        ``sentences``. ``contentId`` is None when the file is not listed in
        *content_ids* (the original raised KeyError and aborted the run).
    """
    items = data['results']['items']
    return {
        'name': file_name,
        'accuracy': mean_confidence(items),
        'contentId': content_ids.get(file_name.split('.')[0]),
        'sentences': split_sentences(items),
    }


if __name__ == "__main__":
    for file_name in os.listdir(path_to_json):
        if not file_name.endswith('.json'):
            continue
        with open(path_to_json + file_name) as json_file:
            data = json.load(json_file)
        print(file_name)
        fileData = summarise_transcript(file_name, data, check)
        print(fileData['accuracy'])
        l.append(fileData['accuracy'])
        allDict.append(fileData)
    l.sort(reverse=True)
    print(l)
    # Highest-confidence transcripts first.
    allDict = sorted(allDict, key=lambda x: x['accuracy'], reverse=True)
    print(allDict)
    with open('outputList.json', 'w') as f:
        # Serialise the list directly. The original ran json.dumps() and then
        # json.dump() on the resulting string, so the file held one big
        # JSON-encoded string rather than a JSON array.
        json.dump(allDict, f)