import pandas as pd import networkx as nx import numpy as np from collections import defaultdict import matplotlib.pyplot as plt
# Raw email table; the MetadataFrom / MetadataTo / RawText columns are read
# further down in this script.
emails = pd.read_csv("./data/Emails.csv")

# key: alias  value: personId
# Built straight from the two columns instead of a row-wise iterrows() loop,
# which constructs a Series per row. If an alias occurs more than once the
# last occurrence wins, exactly as with the original loop.
file = pd.read_csv("./data/Aliases.csv")
aliases = dict(zip(file['Alias'], file['PersonId']))

# key: id  value: name
file = pd.read_csv("./data/Persons.csv")
persons = dict(zip(file['Id'], file['Name']))
# Map every spelling of a sender/recipient onto one canonical name.
def unify_name(name):
    """Return the canonical person name for *name*, or its normalized form.

    The value is converted to ``str``, lower-cased, stripped of commas and cut
    at the first ``"@"``. If the result is a key of the module-level
    ``aliases`` dict, the matching entry of the module-level ``persons`` dict
    is returned; otherwise the normalized string itself is returned.

    Args:
        name: Raw sender/recipient value; anything ``str()`` accepts.

    Returns:
        The entry of ``persons`` for a known alias, else the normalized string.
    """
    key = str(name).lower().replace(',', '').split("@")[0]
    if key not in aliases:
        return key
    # An alias may reference an id that is missing from ``persons``; fall back
    # to the normalized string instead of raising KeyError mid-``apply``.
    return persons.get(aliases[key], key)
def draw_graph(graph):
    """Compute a spring layout for *graph* and draw its nodes.

    graph is described by the original note as a directed graph object. Every
    node must carry a 'pagerank' attribute and every edge a 'weight'
    attribute, because both are read below.
    """
    # Node positions from the spring layout.
    layout = nx.spring_layout(graph)

    # Node size grows with the PR value: the higher the value, the larger
    # the node.
    node_sizes = [attrs['pagerank']*20000 for _, attrs in graph.nodes(data=True)]

    # Per-edge size: square root of the edge weight.
    # NOTE(review): this list is not passed to any drawing call in the part of
    # the function visible here - confirm whether an edge-drawing step is
    # missing.
    edge_sizes = [np.sqrt(attrs['weight']) for _, _, attrs in graph.edges(data=True)]

    # Draw the nodes, semi-transparent.
    nx.draw_networkx_nodes(graph, layout, node_size=node_sizes, alpha=0.4)
# Normalize the sender and recipient values with unify_name.
# MetadataFrom and MetadataTo are column names of the emails table.
emails.MetadataFrom = emails.MetadataFrom.apply(unify_name)
emails.MetadataTo = emails.MetadataTo.apply(unify_name)

# The weight of an edge is the number of emails sent from one name to another.
# {(from, to): num}
# defaultdict(int) makes a missing pair start at 0, so no membership check is
# needed before incrementing. The RawText column is not needed for counting.
edges_weights_temp = defaultdict(int)
for sender, recipient in zip(emails.MetadataFrom, emails.MetadataTo):
    edges_weights_temp[(sender, recipient)] += 1

# Convert the format (from, to): value -> (from, to, value)
edge_weights = [
    (sender, recipient, count)
    for (sender, recipient), count in edges_weights_temp.items()
]