Untitled
unknown
plain_text
a year ago
7.3 kB
24
Indexable
Never
''' Suppose we have an unsorted log file of accesses to web resources. Each log entry consists of an access time, the ID of the user making the access, and the resource ID. Write a function that takes the logs as input, builds the transition graph and returns it as an adjacency list with probabilities. Add __START__ and __END__ states. Specifically, for each resource, we want to compute a list of every possible next step taken by any user, together with the corresponding probabilities. The list of resources should include __START__ but not __END__, since by definition __END__ is a terminal state. Expected output for logs1: ["accesstime", "ID", "resourceID"] logs1 = [ ["200", "user_1", "resource_5"], ["3", "user_1", "resource_1"], ["620", "user_1", "resource_1"], ["620", "user_3", "resource_1"], ["34", "user_6", "resource_2"], ["95", "user_9", "resource_1"], ["416", "user_6", "resource_1"], ["58523", "user_3", "resource_1"], ["53760", "user_3", "resource_3"], ["58522", "user_22", "resource_1"], ["100", "user_3", "resource_6"], ["400", "user_6", "resource_2"], ] transition_graph(logs1) # => { '_START_': {'resource_1': 0.6, 'resource_2': 0.2, 'resource_6': 0.2}, 'resource_1': {'_END_': 0.71, 'resource_3': 0.14, 'resource_5': 0.14}, 'resource_2': {'resource_1': 0.5, 'resource_2': 0.5}, 'resource_3': {'resource_1': 1.0}, 'resource_5': {'resource_1': 1.0}, 'resource_6': {'resource_1': 1.0} } { 'user_1': ['resource_1', 'resource_5', 'resource_1'], 'user_3': ['resource_6', 'resource_1', 'resource_3', 'resource_1'], 'user_6': ['resource_2', 'resource_2', 'resource_1'], 'user_9': ['resource_1'], 'user_22': ['resource_1'], } #probability: For example, of 5 total users, 3 users have resource_1 as a first visit (user_1, user_9, user_22), 1 user has resource_6 as a first visit (user_3), and 1 user has resource_2 as a first visit (user_6), so the possible next steps for __START__ are resource_1 with probability 3/5, resource_2 with probability 1/5, and resource_6 with probability 1/5. logs1 = [ ["200", "user_1", "resource_5"], ["3", "user_1", "resource_1"], ["620", "user_1", "resource_1"], ["620", "user_3", "resource_1"], ["34", "user_6", "resource_2"], ["95", "user_9", "resource_1"], ["416", "user_6", "resource_1"], ["58523", "user_3", "resource_1"], ["53760", "user_3", "resource_3"], ["58522", "user_22", "resource_1"], ["100", "user_3", "resource_6"], ["400", "user_6", "resource_2"], ] These are the resource paths per user for the first logs example, ordered by access time: { 'user_1': ['resource_1', 'resource_5', 'resource_1'], 'user_3': ['resource_6', 'resource_1', 'resource_3', 'resource_1'], 'user_6': ['resource_2', 'resource_2', 'resource_1'], 'user_9': ['resource_1'], 'user_22': ['resource_1'], } Expected output for logs2: transition_graph(logs2) # => { '_START_': {'resource_2': 1.0}, 'resource_1': {'resource_2': 0.5, 'resource_3': 0.5}, 'resource_2': {'_END_': 0.5, 'resource_3': 0.5}, 'resource_3': {'resource_1': 1.0}} Expected output for logs3: transition_graph(logs3) # => { '__START__': {'resource_5': 1.0}, 'resource_5': {'__END__': 1.0} } Expected output for logs4: transition_graph(logs4) # => { '_START_': {'resource_5': 1.0}, 'resource_5': {'_END_': 0.6, 'resource_5': 0.2, 'resource_7': 0.2}, 'resource_7': {'_END_': 1.0} } Expected output for logs5: transition_graph(logs5) # => { '_START_': {'resource_3': 1.0}, 'resource_3': {'_END_': 0.14, 'resource_3': 0.86} } All Test Cases: transition_graph(logs1) transition_graph(logs2) transition_graph(logs3) transition_graph(logs4) transition_graph(logs5) Complexity analysis variables: n: number of logs in the input ''' # completeness solution > complexity def transition_graph(logs): # logs (list of list) -> resource_path (dict user:path) -> transition_graph(dest: probability of next) # sort the logs sorted(log) nlogn logs = [[int(x[0]), x[1], x[2]] for x in logs] logs = sorted(logs) # res_graph graph = {} # loop for the logs for log in logs: time, user, res = log[0], log[1], log[2] if user not in graph: graph[user] = [res] else: graph[user].append([res]) # access the access_time, user, res # if not in res_path: # define # append(res) start = '_START_' trans_graph = { '_START_': {"count": 0}} def traverse(resources, trans_graph): prev = None for res in resources: print("RESULT", res) res = res[0] # dealing with start if prev is None: if res not in trans_graph[start]: trans_graph[start][res] = 1 else: trans_graph[start][res] += 1 else: if prev not in trans_graph: trans_graph[prev] = {res: 1, 'count': 1} else: trans_graph[prev][res] += 1 trans_graph[prev]['count'] = 1 prev = res end = "_END_" if end not in trans_graph[prev]: trans_graph[prev][end] = 1 else: trans_graph[prev][end] += 1 # end= # call traverse for user in graph: res = graph[user][0] traverse(graph[user], trans_graph) # probabilitis for res in trans_graph: count = trans_graph[res]['count'] for next_ in trans_graph[res]: trans_graph[res][next_] /= count # delte del trans_graph[res]['count'] return trans_graph logs1 = [ ["200", "user_1", "resource_5"], ["3", "user_1", "resource_1"], ["620", "user_1", "resource_1"], ["620", "user_3", "resource_1"], ["34", "user_6", "resource_2"], ["95", "user_9", "resource_1"], ["416", "user_6", "resource_1"], ["58523", "user_3", "resource_1"], ["53760", "user_3", "resource_3"], ["58522", "user_22", "resource_1"], ["100", "user_3", "resource_6"], ["400", "user_6", "resource_2"], ] logs2 = [ ["357", "user", "resource_2"], ["1262", "user", "resource_1"], ["1462", "user", "resource_2"], ["1060", "user", "resource_1"], ["756", "user", "resource_3"], ["1090", "user", "resource_3"], ] logs3 = [ ["300", "user_10", "resource_5"], ] logs4 = [ ["1", "user_96", "resource_5"], ["1", "user_10", "resource_5"], ["301", "user_11", "resource_5"], ["301", "user_12", "resource_5"], ["603", "user_12", "resource_5"], ["1603", "user_12", "resource_7"], ] logs5 = [ ["300", "user_1", "resource_3"], ["599", "user_1", "resource_3"], ["900", "user_1", "resource_3"], ["1199", "user_1", "resource_3"], ["1200", "user_1", "resource_3"], ["1201", "user_1", "resource_3"], ["1202", "user_1", "resource_3"] ] transition_graph(logs1)