Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
8.6 kB
3
Indexable
Never
import os
import boto3
import requests
import json
import csv
 
 
# CSV export listing the transcripts to fetch (the first row is skipped,
# presumably a header).
csvFileLocation = "./c85f738c-9b4e-4ba7-8ffd-0bab0091d5f6.csv"
# Directory the downloaded transcript JSON files are written to.
outputTranscriptsLocation = 'transcriptsCoverage3/'

# Base URL; "<requestId>.json" is appended to form each transcript URL.
transcriptURL = 'https://m.media-amazon.com/images/S/vse-vms-closed-captions-artifact-us-east-1-prod/transcripts/'

# Create the output directory up front: open(..., 'w') does not create
# missing parent directories and would otherwise fail on the first file.
os.makedirs(outputTranscriptsLocation, exist_ok=True)

# newline='' is the mode the csv module asks for so quoted fields containing
# line breaks are parsed correctly.
with open(csvFileLocation, 'r', newline='') as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the first row
    for row in csvreader:
        # Blank or truncated rows have no column 6 to read the id from.
        if len(row) < 7:
            continue

        # Column 6 is a '/'-separated path; the request id is its last
        # segment up to the first '.'.
        requestId = row[6].split('/')[-1].split('.')[0]
        print(requestId)

        url = transcriptURL + requestId + '.json'
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, timeout=30)
            # Treat 4xx/5xx as failures instead of trying to parse an error
            # page as a transcript.
            response.raise_for_status()
            transcript = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decode errors on requests versions where
            # they are not a RequestException subclass. Downloading stays
            # best-effort: report the failure and move to the next row.
            print("Skipping " + requestId + ": " + str(e))
            continue

        location = outputTranscriptsLocation + requestId + '.json'
        with open(location, 'w') as f:
            json.dump(transcript, f)
 
 
# path_to_json = 'transcriptsTest/'
# Directory scanned for transcript JSON files.
path_to_json = outputTranscriptsLocation

# Accumulators filled while the transcript files are processed.
l = []          # per-file average confidence values
allDict = []    # one summary dict per transcript file
fileCount = 0   # number of transcript files seen
check = {}      # request id -> content id, taken from the CSV

with open(csvFileLocation, 'r') as file:
    requestId = contentId = ''
    for line_number, record in enumerate(csv.reader(file), start=1):
        # The first CSV row is skipped.
        if line_number == 1:
            continue

        # Request id: last '/'-separated segment of column 6, cut at the
        # first '.'.
        last_segment = record[6].rsplit('/', 1)[-1]
        requestId = last_segment.split('.', 1)[0]
        print(requestId)

        # Content id is the first column.
        contentId = record[0]
        print(contentId)

        check[requestId] = contentId
 
 
def _mean_word_confidence(items):
    """Return the average confidence of the non-punctuation items.

    Punctuation items are left out of both the sum and the count (the
    previous code added their confidence to the sum while excluding them
    from the divisor). Returns 0.0 when there are no word items, so an
    empty transcript no longer raises ZeroDivisionError.
    """
    confidences = [
        float(item['alternatives'][0]['confidence'])
        for item in items
        if item['type'] != 'punctuation'
    ]
    if not confidences:
        return 0.0
    return sum(confidences) / len(confidences)


def _split_into_sentences(items):
    """Group transcript items into sentences terminated by '.' or '?'.

    Each sentence is a dict with 'startTime' (start_time of its first word),
    'endTime' (end_time of the last word seen), 'text' and an empty
    'TranslatedText' placeholder. Words are joined with a leading space, so
    'text' starts with a space; punctuation is appended without one.
    """
    sentences = []
    start = -1           # start_time of the first word of the open sentence
    last_word_end = -1   # end_time of the most recent word item
    text = ""

    def _sentence():
        # Translation is currently disabled (the Amazon Translate call was
        # commented out in the original), so TranslatedText stays empty.
        return {
            'startTime': start,
            'endTime': last_word_end,
            'text': text,
            'TranslatedText': "",
        }

    for item in items:
        token = item['alternatives'][0]['content']

        if item['type'] != 'punctuation':
            # Timing is only read from word items; reading start_time from
            # whichever item opened the sentence raised KeyError when that
            # item was punctuation without timing fields.
            if start == -1:
                start = item['start_time']
            last_word_end = item['end_time']
            text += ' ' + token
            continue

        text += token
        if token == '.' or token == '?':
            # A terminator with no preceding word carries no timing, so no
            # sentence is emitted for it.
            if start != -1:
                sentences.append(_sentence())
            start = -1
            text = ""

    # Keep trailing words that are not followed by a '.' or '?'; they were
    # previously dropped.
    if start != -1:
        sentences.append(_sentence())

    return sentences


for file_name in os.listdir(path_to_json):
    if not file_name.endswith('.json'):
        continue

    with open(os.path.join(path_to_json, file_name)) as json_file:
        fileCount += 1
        data = json.load(json_file)

    print(file_name)
    items = data['results']['items']

    accuracy = _mean_word_confidence(items)
    print(accuracy)

    fileData = {
        'name': file_name,
        'accuracy': accuracy,
        # File names are "<requestId>.json"; look up the matching content id.
        'contentId': check[file_name.split('.')[0]],
        'sentences': _split_into_sentences(items),
    }

    l.append(accuracy)
    allDict.append(fileData)
 
 
# Show the per-file average confidences, highest first.
l.sort(reverse=True)

print(l)

# fileCount is an int; concatenating it directly to a str raised TypeError,
# so convert it explicitly.
# NOTE(review): fileCount is the number of JSON files processed; no 85%
# threshold is applied in this script -- confirm the message wording.
print("No. of jsons with accuracy with 'accuracy' > '85%' out of 1000 " + str(fileCount))
 
# allDict = sorted(allDict, key=lambda x: x['accuracy'], reverse=True)
 
# print(allDict)
 
 
# jsonData = json.dumps(allDict)
 
# with open('outputList.json', 'w') as f:
#     json.dump(jsonData, f)
src/CCTranslationsPOC/transcriptsToSentencesJsons.py
Lines 1 to 128 (Context lines: 5, 20, 100)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import json
import boto3
import requests
import json
import csv
 
# Directory scanned for transcript JSON files.
path_to_json = 'transcripts2/'

# Accumulators filled while the transcript files are processed.
l = []          # per-file average confidence values
allDict = []    # one summary dict per transcript file
check = {}      # request id -> content id, taken from the CSV

with open("./3b1f7bc4-49b4-4855-9cf8-a81c093573fb.csv", 'r') as file:
    requestId = contentId = ''
    records = csv.reader(file)
    # Discard the first CSV row before reading the data rows.
    next(records, None)
    for record in records:
        # Request id: last '/'-separated segment of column 6, cut at the
        # first '.'.
        file_part = record[6].rsplit('/', 1)[-1]
        requestId = file_part.split('.', 1)[0]
        print(requestId)

        # Content id is the first column.
        contentId = record[0]
        print(contentId)

        check[requestId] = contentId
 
 
def _mean_word_confidence(items):
    """Return the average confidence of the non-punctuation items.

    Punctuation items are left out of both the sum and the count (the
    previous code added their confidence to the sum while excluding them
    from the divisor). Returns 0.0 when there are no word items, so an
    empty transcript no longer raises ZeroDivisionError.
    """
    confidences = [
        float(item['alternatives'][0]['confidence'])
        for item in items
        if item['type'] != 'punctuation'
    ]
    if not confidences:
        return 0.0
    return sum(confidences) / len(confidences)


def _split_into_sentences(items):
    """Group transcript items into sentences terminated by '.' or '?'.

    Each sentence is a dict with 'startTime' (start_time of its first word),
    'endTime' (end_time of the last word seen), 'text' and an empty
    'TranslatedText' placeholder. Words are joined with a leading space, so
    'text' starts with a space; punctuation is appended without one.
    """
    sentences = []
    start = -1           # start_time of the first word of the open sentence
    last_word_end = -1   # end_time of the most recent word item
    text = ""

    def _sentence():
        # Translation is currently disabled (the Amazon Translate call was
        # commented out in the original), so TranslatedText stays empty.
        return {
            'startTime': start,
            'endTime': last_word_end,
            'text': text,
            'TranslatedText': "",
        }

    for item in items:
        token = item['alternatives'][0]['content']

        if item['type'] != 'punctuation':
            # Timing is only read from word items; reading start_time from
            # whichever item opened the sentence raised KeyError when that
            # item was punctuation without timing fields.
            if start == -1:
                start = item['start_time']
            last_word_end = item['end_time']
            text += ' ' + token
            continue

        text += token
        if token == '.' or token == '?':
            # A terminator with no preceding word carries no timing, so no
            # sentence is emitted for it.
            if start != -1:
                sentences.append(_sentence())
            start = -1
            text = ""

    # Keep trailing words that are not followed by a '.' or '?'; they were
    # previously dropped.
    if start != -1:
        sentences.append(_sentence())

    return sentences


for file_name in os.listdir(path_to_json):
    if not file_name.endswith('.json'):
        continue

    with open(os.path.join(path_to_json, file_name)) as json_file:
        data = json.load(json_file)

    print(file_name)
    items = data['results']['items']

    accuracy = _mean_word_confidence(items)
    print(accuracy)

    fileData = {
        'name': file_name,
        'accuracy': accuracy,
        # File names are "<requestId>.json"; look up the matching content id.
        'contentId': check[file_name.split('.')[0]],
        'sentences': _split_into_sentences(items),
    }

    l.append(accuracy)
    allDict.append(fileData)
 
 
# Show the per-file average confidences, highest first.
l.sort(reverse=True)

print(l)

# Order the per-file summaries by average confidence, highest first.
allDict = sorted(allDict, key=lambda x: x['accuracy'], reverse=True)

print(allDict)


# Serialise once. The previous code passed the already-encoded string to
# json.dump, so outputList.json held a single JSON string (escaped JSON
# inside quotes) rather than a JSON array.
# NOTE(review): any consumer that decoded the file twice must now decode once.
jsonData = json.dumps(allDict)

with open('outputList.json', 'w') as f:
    f.write(jsonData)