Source code for lib.analysis.network

import sys
from os import path
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
import re
import networkx as nx
from networkx.algorithms.components.connected import connected_components
from datetime import date
import lib.util as util
import csv
import numpy as np
import lib.config as config
from itertools import izip_longest as zip_longest
import lib.analysis.user as user


[docs]def message_number_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False): """ Creates a directed graph with each node representing an IRC user and each directed edge has a weight which mentions the number messages sent and recieved by that user in the selected time frame. Args: log_dict (dict): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name} nicks(list): list of all the nicks nick_same_list(list): list of lists mentioning nicks which belong to same users Returns: message_number_graph (nx graph object) """ message_number_day_list = [] conversations=[[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)] aggregate_message_number_graph = nx.DiGraph() #graph with multiple directed edges between clients used G = util.to_graph(nick_same_list) conn_comp_list = list(connected_components(G)) conn_comp_list = util.create_connected_nick_list(conn_comp_list) def msg_no_analysis_helper(rec_list, corrected_nick, nick, conn_comp_list,conversations,today_conversation): for receiver in rec_list: if(receiver == nick): if(corrected_nick != nick): nick_receiver = '' nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick, conn_comp_list, nick_receiver) if DAY_BY_DAY_ANALYSIS: today_conversation = util.extend_conversation_list(nick_sender, nick_receiver, today_conversation) else: conversations = util.extend_conversation_list(nick_sender, nick_receiver, conversations) def message_no_add_egde(message_graph, conversation): for index in xrange(config.MAX_EXPECTED_DIFF_NICKS): if(len(conversation[index]) == 3 and conversation[index][0] >= config.THRESHOLD_MESSAGE_NUMBER_GRAPH): if len(conversation[index][1]) >= config.MINIMUM_NICK_LENGTH and len(conversation[index][2]) >= config.MINIMUM_NICK_LENGTH: message_graph.add_edge(util.get_nick_representative(nicks, nick_same_list, conversation[index][1]), util.get_nick_representative(nicks, nick_same_list, conversation[index][2]), weight=conversation[index][0]) return message_graph for day_content_all_channels in log_dict.values(): for day_content in day_content_all_channels: day_log = day_content["log_data"] today_conversation = [[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)] for line in day_log: flag_comma = 0 if(util.check_if_msg_line (line)): parsed_nick = re.search(r"\<(.*?)\>", line) corrected_nick = util.correctLastCharCR(parsed_nick.group(0)[1:-1]) nick_sender = "" nick_receiver = "" nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, corrected_nick, conn_comp_list, nick_sender) for nick in nicks: rec_list = [e.strip() for e in line.split(':')] rec_list = util.rec_list_splice(rec_list) if not rec_list[1]: break rec_list = util.correct_last_char_list(rec_list) msg_no_analysis_helper(rec_list, corrected_nick, nick, conn_comp_list, conversations,today_conversation) if "," in rec_list[1]: flag_comma = 1 rec_list_2=[e.strip() for e in rec_list[1].split(',')] for i in xrange(0,len(rec_list_2)): if(rec_list_2[i]): rec_list_2[i] = util.correctLastCharCR(rec_list_2[i]) msg_no_analysis_helper(rec_list_2, corrected_nick, nick, conn_comp_list, conversations, today_conversation) if(flag_comma == 0): rec = line[line.find(">")+1:line.find(", ")] rec = rec[1:] rec = util.correctLastCharCR(rec) if(rec == nick): if(corrected_nick != nick): nick_receiver = nick_receiver_from_conn_comp(nick, conn_comp_list) if DAY_BY_DAY_ANALYSIS: today_message_number_graph = nx.DiGraph() today_message_number_graph = message_no_add_egde(today_message_number_graph, today_conversation) year, month, day = util.get_year_month_day(day_content) message_number_day_list.append([today_message_number_graph, year+'-'+month+'-'+day]) print "\nBuilding graph object with EDGE WEIGHT THRESHOLD:", config.THRESHOLD_MESSAGE_NUMBER_GRAPH if not DAY_BY_DAY_ANALYSIS: aggregate_message_number_graph = message_no_add_egde(aggregate_message_number_graph, conversations) if config.DEBUGGER: print "========> 30 on " + str(len(conversations)) + " conversations" print conversations[:30] if DAY_BY_DAY_ANALYSIS: return message_number_day_list else: return aggregate_message_number_graph
[docs]def channel_user_presence_graph_and_csv(nicks, nick_same_list, channels_for_user, nick_channel_dict, nicks_hash, channels_hash): """ creates a directed graph for each nick, each edge from which points to the IRC Channels that nick has participated in. (Nick changes are tracked here and only the initial nick is shown if a user changed his nick) Args: nicks(list): list of all the nicks nick_same_list(list): list of lists mentioning nicks which belong to same users channels_for_user(dict): dictionary with keys as nicks and value as list of channels on which user with nick is present nick_channel_dict(dict): channels and nicks present on them nicks_hash(list): hash values of nicks channels_hash(list): hash values of channels Returns: presence_graph_and_matrix (dict): contains adjacency matrices and graphs for Acc Auu Acu full_presence_graph (nx graph object) """ presence_graph_and_matrix = { "CC": { "graph": None, "matrix": None, "reducedMatrix": None, "reducedGraph": None }, "CU": { "graph": None, "matrix": None, "reducedMatrix": None, "reducedGraph": None }, "UU": { "graph": None, "matrix": None, "reducedMatrix": None, "reducedGraph": None }, } users_on_channel = {} full_presence_graph = nx.DiGraph() #directed def create_adj_matrix(hashed_list1, hashed_list2): adj_matrix = [[0] * len(hashed_list1) for i in range(len(hashed_list2))] return adj_matrix def add_channel_weighted_edge(graph, adjlist, nicks_hash, channels_hash, channel): graph.add_edge(nicks_hash.index(adjlist['nickname']), (config.STARTING_HASH_CHANNEL + channels_hash.index(channel[0])), weight=channel[1]) return graph #====================== CHANNEL_USER ============================ channel_user_graph = nx.Graph() CU_adjacency_matrix = create_adj_matrix(nicks_hash, channels_hash) for adjlist in nick_channel_dict: for channel in adjlist['channels']: if channel[1] > config.FILTER_FOR_CHANNEL_USER_GRAPH: # print str(nicks_hash.index(adjlist['nickname']))+"\t"+str(config.STARTING_HASH_CHANNEL +channels_hash.index(channel[0])) channel_user_graph = add_channel_weighted_edge(channel_user_graph, adjlist, nicks_hash, channels_hash, channel) full_presence_graph = add_channel_weighted_edge(full_presence_graph, adjlist, nicks_hash, channels_hash, channel) # print nicks_hash.index(adjlist['nickname']),adjlist['nickname'], channels_hash.index(channel[0]),channel[0], channel[1] CU_adjacency_matrix[channels_hash.index(channel[0])][nicks_hash.index(adjlist['nickname'])] = channel[1] # print adjlist['nickname'], channel[0] if users_on_channel.has_key(channel[0]): if adjlist['nickname'] not in users_on_channel[channel[0]]: users_on_channel[channel[0]].append(adjlist['nickname']) else: users_on_channel[channel[0]] = [adjlist['nickname']] presence_graph_and_matrix["CU"]["matrix"] = CU_adjacency_matrix presence_graph_and_matrix["CU"]["graph"] = channel_user_graph print "CU Adjacency Matrix Generated" #====================== CHANNEL_CHANNEL ============================ channel_channel_graph = nx.Graph() CC_adjacency_matrix = create_adj_matrix(channels_hash, channels_hash) def add_common_users_weighted_edge(graph, index1, index2, common_users): graph.add_edge(str(config.STARTING_HASH_CHANNEL + index1), str(config.STARTING_HASH_CHANNEL + index2), weight=len(common_users)) return graph for i in xrange(0, len(channels_hash)): for j in xrange(i+1, len(channels_hash)): common_users = list(set(users_on_channel[channels_hash[i]]) & set(users_on_channel[channels_hash[j]])) # print users_on_channel.keys()[i], users_on_channel.keys()[j], common_users CC_adjacency_matrix[i][j] = len(common_users) CC_adjacency_matrix[j][i] = len(common_users) if len(common_users) > config.FILTER_FOR_CHANNEL_CHANNEL_GRAPH: full_presence_graph = add_common_users_weighted_edge(full_presence_graph, i, j, common_users) full_presence_graph = add_common_users_weighted_edge(full_presence_graph, j, i, common_users) '''full_presence_graph.add_edge(str(config.STARTING_HASH_CHANNEL + i), str(config.STARTING_HASH_CHANNEL + j), weight=len(common_users)) full_presence_graph.add_edge(str(config.STARTING_HASH_CHANNEL + j), str(config.STARTING_HASH_CHANNEL + i), weight=len(common_users))''' # "Uncomment for directed version" # print str(channels_hash.index(users_on_channel.keys()[i]))+"\t"+str(channels_hash.index(users_on_channel.keys()[j])) # print str(channels_hash.index(users_on_channel.keys()[j]))+"\t"+str(channels_hash.index(users_on_channel.keys()[i])) # print channels_hash[i],channels_hash[j] channel_channel_graph = add_common_users_weighted_edge(channel_channel_graph, i , j, common_users) '''channel_channel_graph.add_edge(str(config.STARTING_HASH_CHANNEL + i), str(config.STARTING_HASH_CHANNEL + j), weight=len(common_users))''' presence_graph_and_matrix["CC"]["matrix"] = CC_adjacency_matrix presence_graph_and_matrix["CC"]["graph"] = channel_channel_graph print "CC Adjacency Matrix Generated" #====================== USER_USER ============================ user_user_graph = nx.Graph() UU_adjacency_matrix = create_adj_matrix(nicks_hash, nicks_hash) for user_channel_dict in channels_for_user: # user_channel_dict format : {nick : [channels on that day], } for i in xrange(0, len(user_channel_dict.keys())): for j in xrange(i+1, len(user_channel_dict.keys())): common_channels_on_that_day = list(set(user_channel_dict[user_channel_dict.keys()[i]]) & set(user_channel_dict[user_channel_dict.keys()[j]])) # print user_channel_dict.keys()[i], user_channel_dict.keys()[j], common_channels_on_that_day user1 = user_channel_dict.keys()[i] user2 = user_channel_dict.keys()[j] no_of_common_channels_day = len(common_channels_on_that_day) #print str(nicks_hash.index(user1))+"\t"+str(nicks_hash.index(user2)) # "Uncomment for directed version" # print str(nicks_hash.index(user2))+"\t"+str(nicks_hash.index(user1)) # print user1, user2 UU_adjacency_matrix[nicks_hash.index(user1)][nicks_hash.index(user2)] += no_of_common_channels_day UU_adjacency_matrix[nicks_hash.index(user2)][nicks_hash.index(user1)] += no_of_common_channels_day for i in range(len(nicks_hash)): for j in range(i): if UU_adjacency_matrix[i][j] > config.FILTER_FOR_USER_USER_GRAPH: # print str(i)+'\t'+str(j) user_user_graph.add_edge(i, j, weight=UU_adjacency_matrix[i][j]) full_presence_graph.add_edge(i, j, weight=UU_adjacency_matrix[i][j]) full_presence_graph.add_edge(j, i, weight=UU_adjacency_matrix[i][j]) presence_graph_and_matrix["UU"]["matrix"] = UU_adjacency_matrix presence_graph_and_matrix["UU"]["graph"] = user_user_graph print "UU Adjacency Matrix Generated" def print_node_degree(nodes, max_degree_possible): for i in range(max_degree_possible): print "deg"+str(i)+'\t'+str(nodes[i]) degree_map = {"out": full_presence_graph.out_degree().values(), "in": full_presence_graph.in_degree().values(), "all": full_presence_graph.degree().values()} def inc_degree(degree_list, nodes, max_degree_possible): for degree in degree_list: if not degree < max_degree_possible: print "===error", degree else: nodes[degree] += 1 return nodes #========================================================================= if config.GENERATE_DEGREE_ANAL: max_degree_possible = config.CHANNEL_USER_MAX_DEG nodes_with_OUT_degree = [0]*max_degree_possible nodes_with_IN_degree = [0]*max_degree_possible nodes_with_TOTAL_degree = [0]*max_degree_possible # print full_presence_graph.out_degree().values() nodes_with_OUT_degree = inc_degree(degree_map["out"], nodes_with_OUT_degree, max_degree_possible) nodes_with_IN_degree = inc_degree(degree_map["in"], nodes_with_IN_degree, max_degree_possible) nodes_with_TOTAL_degree = inc_degree(degree_map["all"], nodes_with_TOTAL_degree, max_degree_possible) print "========= OUT DEGREE =======" print_node_degree(nodes_with_OUT_degree, max_degree_possible) print "========= IN DEGREE =======" print_node_degree(nodes_with_IN_degree, max_degree_possible) print "========= TOTAL DEGREE =======" print_node_degree(nodes_with_TOTAL_degree, max_degree_possible) #========================================================================= ''' We have around 20k users and most of them just visit a channel once, hence we filter out the top 100 users This inturns reduces the CU and UU matrices ''' ''' calculate top <how_many_top_users> users this is achieved by taking top users from CU matrix on the basis of the column sum (total number of days active on a channel) # ''' how_many_top_users = config.FILTER_TOP_USERS ''' we also need to filter the channels and are filtered on the basis of row sum of CC matrix ''' how_many_top_channels = config.FILTER_TOP_CHANNELS sum_for_each_channel = [] for channel_row in CC_adjacency_matrix: sum_for_each_channel.append(sum(channel_row)) def get_top_indices(sum_list, how_many_vals): return sorted(range(len(sum_list)), key=lambda i: sum_list[i], reverse=True)[:how_many_vals] def get_indices_to_delete(hash_list, top_indices): return list(set([i for i in range(len(hash_list))]) - set(top_indices)) #filter out top <how_many_top_channels> indices top_indices_channels = get_top_indices(sum_for_each_channel, how_many_top_channels) indices_to_delete_channels = get_indices_to_delete(channels_hash, top_indices_channels) temp_channels = np.delete(CC_adjacency_matrix, indices_to_delete_channels, 1) #delete columns reduced_CC_adjacency_matrix = np.delete(temp_channels, indices_to_delete_channels, 0) #delete rows presence_graph_and_matrix["CC"]["reducedMatrix"] = reduced_CC_adjacency_matrix reduced_CC_graph = channel_channel_graph.copy() reduced_CC_graph.remove_nodes_from(map(str, np.array(indices_to_delete_channels) + config.STARTING_HASH_CHANNEL)) # say the indices to remove are 1,2 presence_graph_and_matrix["CC"]["reducedGraph"] = reduced_CC_graph presence_graph_and_matrix["CC"]["reducedGraph"] = reduced_CC_graph print "Generated Reduced CC Adjacency Matrix" #to calculate sum first take the transpose of CU matrix so users in row UC_adjacency_matrix = zip(*CU_adjacency_matrix) sum_for_each_user = [] for user_row in UC_adjacency_matrix: sum_for_each_user.append(sum(user_row)) #filter out top <how_many_top_users> indices top_indices_users = get_top_indices(sum_for_each_user, how_many_top_users) indices_to_delete_users = get_indices_to_delete(nicks_hash, top_indices_users) # print len(top_indices_users), top_indices_users # print len(indices_to_delete_users), indices_to_delete_users #update the nick_hash, channel_hash reduced_nick_hash = np.delete(nicks_hash, indices_to_delete_users) reduced_channel_hash = np.delete(channels_hash, indices_to_delete_channels) #update the CU matrix by deleting particular columns, and rows which are not in top_indices_users, channels temp_user_channel = np.delete(CU_adjacency_matrix, indices_to_delete_users, 1) #delete columns reduced_CU_adjacency_matrix = np.delete(temp_user_channel, indices_to_delete_channels, 0) #delete rows reduced_CU_graph = channel_user_graph.copy() reduced_CU_graph.remove_nodes_from(np.array(indices_to_delete_channels) + config.STARTING_HASH_CHANNEL) #remove non top channels_ reduced_CU_graph.remove_nodes_from(np.array(indices_to_delete_users)) #remove users presence_graph_and_matrix["CU"]["reducedGraph"] = reduced_CU_graph print "Generated Reduced CU Adjacency Matrix" presence_graph_and_matrix["CU"]["reducedMatrix"] = reduced_CU_adjacency_matrix #update the UU matrix by deleting both columns and rows temp_users = np.delete(UU_adjacency_matrix, indices_to_delete_users, 1) #delete columns reduced_UU_adjacency_matrix = np.delete(temp_users, indices_to_delete_users, 0) #delete rows reduced_UU_graph = user_user_graph.copy() reduced_UU_graph.remove_nodes_from(np.array(indices_to_delete_users)) presence_graph_and_matrix["UU"]["reducedGraph"] = reduced_UU_graph print "Generated Reduced UU Adjacency Matrix" presence_graph_and_matrix["UU"]["reducedMatrix"] = reduced_UU_adjacency_matrix if config.PRINT_CHANNEL_USER_HASH: print "==================================================" print "========= REDUCED NICK HASH =========" for i in range(len(reduced_nick_hash)): print str(i)+"\t"+reduced_nick_hash[i] print "========= REDUCED CHANNEL HASH =========" for i in range(len(reduced_channel_hash)): print str(config.STARTING_HASH_CHANNEL + i)+"\t"+reduced_channel_hash[i] return presence_graph_and_matrix, full_presence_graph
[docs]def filter_edge_list(edgelist_file_loc, max_hash, how_many_top): """ reduces the edge list by selecting top nodes through degree analysis Arguments: edgelist_file_loc (str): location of the edgelist file max_hash (int): max possinle value of the node_hash in edgelist how_many_top (int): how many top nodes to select in the new edgeList Returns: null """ node_list = [] degrees = [0] * max_hash with open(edgelist_file_loc) as f: content = f.readlines() for line in content: a, b = line.split() node_list.append(int(a)) node_list.append(int(b)) degrees[int(a)] += 1 degrees[int(b)] += 1 print "Done Pre Computation" print "Max_hash", max(node_list) max_hash = max(node_list) degrees = np.array(degrees) print "========TOP "+str(how_many_top)+" NODES ON BASIS OF DEGREE ========" top_nodes = list(degrees.argsort()[::-1])[:how_many_top] # print top_nodes print "======= UPDATED ADJACENY LIST ON THE BASIS OF ABOVE NODES =======" with open(edgelist_file_loc) as f: content = f.readlines() for line in content: a, b = map(int, line.split()) if a in top_nodes and b in top_nodes: print str(a) + "\t" + str(b)
[docs]def degree_analysis_on_graph(nx_graph, date=None, directed = True): """ perform degree analysis of input graph object Arguments: nx_graph (nx_object): object to perform analysis on date(string): timestamp directed(boolean): True if nx_graph is directed else False Returns: dictionary: with in_degree, out_degree & total_degree for directed graphs and degree as key for undirected_graphs """ def nodes_with_degree_populator(degree_values, label): nodes_with_degree = [] if len(degree_values): nodes_with_degree = [[label + str(i), 0, ''] for i in xrange((max(degree_values)+1))] else: nodes_with_degree = [["NA", 0, "NA"]] for degree in degree_values: nodes_with_degree[degree][1] += 1 return nodes_with_degree def give_userlist_where_degree_helper(degree_dict, degree): key_list = "" for key in degree_dict: if degree_dict[key] == degree: key_list += (str(key) + ", ") return key_list degree_map = {} # will map a string(eg "out", "in" , "all") to nx_graph.out_degree() etc def raw_node_append(nodes, raw, degree_type): """ Args: nodes(List) : nodes_with_OUT/IN/TOTAL degree\ raw(List) : raw_in/out/total degree_type(str) : "in" "out" or "all" , basically keys of degree_map Returns: raw(List) nodes(List) """ for i in range(1, len(nodes)): raw.append(nodes[i][1]) # raw will store the number of nodes with degree 0 in position 1, # of nodes with degree 1 in position 2 etc nodes[i][2] = give_userlist_where_degree_helper(degree_map[degree_type], i - 1) return raw, nodes if directed: nodes_with_OUT_degree = nodes_with_degree_populator(nx_graph.out_degree().values(), "nodes_w_out_deg") nodes_with_IN_degree = nodes_with_degree_populator(nx_graph.in_degree().values(), "nodes_w_in_deg") nodes_with_TOTAL_degree = nodes_with_degree_populator(nx_graph.degree().values(), "nodes_w_deg") nodes_with_OUT_degree.insert(0, ["total_nodes", sum(data[1] for data in nodes_with_OUT_degree)]) # sum of (number of nodes with degree 1 +number of nodes with degre 2..) nodes_with_IN_degree.insert(0, ["total_nodes", sum(data[1] for data in nodes_with_IN_degree)]) nodes_with_TOTAL_degree.insert(0, ["total_nodes", sum(data[1] for data in nodes_with_TOTAL_degree)]) raw_out = [str(date)] raw_in = [str(date)] raw_total = [str(date)] degree_map = {"out":nx_graph.out_degree(),"in":nx_graph.in_degree(),"all":nx_graph.degree()} raw_out, nodes_with_OUT_degree = raw_node_append(nodes_with_OUT_degree, raw_out, "out") raw_in, nodes_with_IN_degree = raw_node_append(nodes_with_IN_degree, raw_in, "in") raw_total, nodes_with_TOTAL_degree = raw_node_append(nodes_with_TOTAL_degree, raw_total, "all") return { "out_degree": { "formatted_for_csv": nodes_with_OUT_degree, "raw_for_vis": raw_out }, "in_degree": { "formatted_for_csv": nodes_with_IN_degree, "raw_for_vis": raw_in }, "total_degree": { "formatted_for_csv": nodes_with_TOTAL_degree, "raw_for_vis": raw_total } } # for undirected else: nodes_with_degree_undirected = nodes_with_degree_populator(nx_graph.degree().values(), "nodes_w_deg") nodes_with_degree_undirected.insert(0, ["total_nodes", sum(data[1] for data in nodes_with_degree_undirected)]) raw_degree = [str(date)] degree_map = {"all":nx_graph.degree()} raw_degree, nodes_with_degree_undirected = raw_node_append(nodes_with_degree_undirected, raw_degree, "all") return { "degree":{ "formatted_for_csv" : nodes_with_degree_undirected, "raw_for_vis" : raw_degree } }
[docs]def message_time_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False): """ creates a directed graph where each edge denotes a message sent from a user to another user with the stamp denoting the time at which the message was sent Args: log_dict (dictionary): Dictionary of logs data created using reader.py nicks(List) : List of nickname created using nickTracker.py nick_same_list(List) :List of same_nick names created using nickTracker.py DAY_BY_DAY_ANALYSIS: True if graphs are produced for each day Returns: msg_time_graph_list(List): List of message time graphs for different days msg_time_aggr_graph: aggregate message time graph where edges are date + time when sender sends a message to receiver """ msg_time_graph_list = [] msg_time_aggr_graph = nx.MultiDiGraph() G = util.to_graph(nick_same_list) conn_comp_list = list(connected_components(G)) def compare_spliced_nick(nick_to_compare, spliced_nick, nick_name, line): if(nick_to_compare == nick_name): if(spliced_nick != nick_name): nick_receiver = nick_receiver_from_conn_comp(nick_name, conn_comp_list) util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph) conn_comp_list = util.create_connected_nick_list(conn_comp_list) for day_content_all_channels in log_dict.values(): for day_content in day_content_all_channels: day_log = day_content["log_data"] year, month, day = util.get_year_month_day(day_content) graph_conversation = nx.MultiDiGraph() #graph with multiple directed edges between clients used for line in day_log: flag_comma = 0 if(util.check_if_msg_line (line)): m = re.search(r"\<(.*?)\>", line) spliced_nick = util.correctLastCharCR(m.group(0)[1:-1]) nick_sender = "" nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, spliced_nick, conn_comp_list, nick_sender) for nick_name in nicks: rec_list = [e.strip() for e in line.split(':')] #receiver list splited about : rec_list = util.rec_list_splice(rec_list) if not rec_list[1]: #index 0 will contain time 14:02 break rec_list = util.correct_last_char_list(rec_list) for nick_to_search in rec_list: if(nick_to_search == nick_name): if(spliced_nick != nick_name): nick_receiver = "" nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick_name, conn_comp_list, nick_receiver) util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph) if "," in rec_list[1]: #receiver list may of the form <Dhruv> Rohan, Ram : flag_comma = 1 rec_list_2 = [e.strip() for e in rec_list[1].split(',')] rec_list_2 = util.correct_last_char_list(rec_list_2) for nick_to_search in rec_list_2: compare_spliced_nick(nick_to_search, spliced_nick, nick_name, line) if(flag_comma == 0): #receiver list can be <Dhruv> Rohan, Hi! rec = line[line.find(">") + 1:line.find(", ")] rec = util.correctLastCharCR(rec[1:]) compare_spliced_nick(rec, spliced_nick, nick_name, line) msg_time_graph_list.append(graph_conversation) if DAY_BY_DAY_ANALYSIS: return msg_time_graph_list else: return msg_time_aggr_graph
[docs]def message_number_bins_csv(log_dict, nicks, nick_same_list): """ creates a CSV file which tracks the number of message exchanged in a channel for 48 bins of half an hour each distributed all over the day aggragated over the year. Args: log_dict (dictionary): Dictionary of logs data created using reader.py nicks(List) : List of nickname created using nickTracker.py nick_same_list(List) :List of same_nick names created using nickTracker.p Returns: bin_matrix(list of lists): a list of lists of 48 bins with number of messages sent in each bin tot_msgs: total messages exchanged """ no_of_bins = (config.HOURS_PER_DAY * config.MINS_PER_HOUR) / config.BIN_LENGTH_MINS tot_msgs = [0] * no_of_bins bin_matrix = [] def bin_increment(nick_name, messager, nick_spliced, bins, bin_index): if(nick_name == messager): if(nick_spliced != messager): bins[bin_index] = bins[bin_index] + 1 for day_content_all_channels in log_dict.values(): for day_content in day_content_all_channels: day_log = day_content["log_data"] bins = [0] * no_of_bins for line in day_log: if(line[0] == '['): time_in_min = int(line[1:3]) * 60 + int(line[4:6]) bin_index = time_in_min / config.BIN_LENGTH_MINS #gives the index of the bin for eg 01:35 in mins is 95 then 95/30 --> 3 , puts it in bin index 3 flag_comma = 0 # sometimes messages are sent to users like [22:55] <Rohan> Krishna, i hope we are on track. and sometimes like [22:56] <Krishna> Rohan: Yes it is. if(line[0] != '=' and "] <" in line and "> " in line): m = re.search(r"\<(.*?)\>", line) nick_spliced = util.correctLastCharCR(m.group(0)[1:-1]) for messager in nicks: rec_list = [e.strip() for e in line.split(':')] rec_list = util.rec_list_splice(rec_list) if not rec_list[1]: break rec_list = util.correct_last_char_list(rec_list) for nick_name in rec_list: bin_increment(nick_name, messager, nick_spliced, bins, bin_index) if "," in rec_list[1]: flag_comma = 1 rec_list = [e.strip() for e in rec_list[1].split(',')] rec_list = util.correct_last_char_list(rec_list) for nick_name in rec_list: bin_increment(nick_name, messager, nick_spliced, bins, bin_index) if(flag_comma == 0): rec = line[line.find(">")+1:line.find(", ")][1:] rec = util.correctLastCharCR(rec) bin_increment(rec, messager, nick_spliced, bins, bin_index) bin_matrix.append(bins) tot_msgs = [tot_msgs[i] + bins[i] for i in range(len(bins))] return bin_matrix, sum(tot_msgs)
[docs]def degree_node_number_csv(log_dict, nicks, nick_same_list): """ creates two csv files having no. of nodes with a certain in and out-degree for number of nodes it interacted with, respectively. Also gives graphs for log(degree) vs log(no. of nodes) and tries to find it's equation by curve fitting Args: log_dict (dict): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name} nicks(list): list of all the nicks nick_same_list(list): list of lists mentioning nicks which belong to same users Returns: out_degree (list) in_degree (list) total_degree (list) """ msg_num_graph_day_list = message_number_graph(log_dict, nicks, nick_same_list, True) degree_analysis_day_list = [] for day_graph_list in msg_num_graph_day_list: day_graph = day_graph_list[0] degree_analysis_day_list.append(degree_analysis_on_graph(day_graph, day_graph_list[1])) out_degree = [] in_degree = [] total_degree = [] max_in_degree = 0 max_out_degree = 0 max_total_degree = 0 for degree_analysis in degree_analysis_day_list: out_degree.append(degree_analysis["out_degree"]["raw_for_vis"]) in_degree.append(degree_analysis["in_degree"]["raw_for_vis"]) total_degree.append(degree_analysis["total_degree"]["raw_for_vis"]) max_out_degree = max(max_out_degree, len(degree_analysis["out_degree"]["raw_for_vis"])) max_in_degree = max(max_in_degree, len(degree_analysis["in_degree"]["raw_for_vis"])) max_total_degree = max(max_total_degree, len(degree_analysis["total_degree"]["raw_for_vis"])) def format_degree_list(degree_list, max_range, degree_type): degree_head_row = ["deg"+str(i) for i in range(max_range)] degree_head_row.insert(0, degree_type) degree_list.insert(0, degree_head_row) degree_list = list(zip_longest(*degree_list, fillvalue=0)) return degree_list out_degree = format_degree_list(out_degree, max_out_degree, "out_degree") in_degree = format_degree_list(in_degree, max_in_degree, "in_degree") total_degree = format_degree_list(total_degree, max_total_degree, "total_degree") return out_degree, in_degree, total_degree
[docs]def nick_receiver_from_conn_comp(nick, conn_comp_list): """ creates nick_receiver from conn_comp_list, it is a helper function used in create_message_time_graph and message_number_graph """ nick_receiver = "" for i in range(config.MAX_EXPECTED_DIFF_NICKS): if nick in conn_comp_list[i]: nick_receiver = conn_comp_list[i][0] break return nick_receiver
[docs]def identify_hubs_and_experts(log_dict, nicks, nick_same_list): """ uses message_number graph to identify hubs and experts in the network Args: log_dict (dict): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name} nicks(list): list of all the nicks nick_same_list(list): list of lists mentioning nicks which belong to same users Returns: message_graph(nx graph): message number graph top_hub(list): list of top hubs top_keyword_overlap(list): top users from keywords digest top_auth: list of top authorities """ message_graph = message_number_graph(log_dict, nicks, nick_same_list) hubs, authority_values = nx.hits(message_graph) keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words, keywords_for_channels = user.keywords(log_dict, nicks, nick_same_list) if config.DEBUGGER: print "========> USERS" print user_keyword_freq_dict print "========> CHANNELS" print keywords_for_channels, len(keywords_for_channels) top_keywords_for_channels = [] for word_tuple in keywords_for_channels[:config.NUMBER_OF_KEYWORDS_CHANNEL_FOR_OVERLAP]: top_keywords_for_channels.append(word_tuple[0]) overlap_word_number = [] for keyword_tuple in user_keyword_freq_dict: keywords_for_user = keyword_tuple['keywords'] username = keyword_tuple['nick'] overlapping_keywords = list(set(top_keywords_for_channels).intersection([x[0] for x in keywords_for_user])) if len(overlapping_keywords) > 0: overlap_word_number.append([username, len(overlapping_keywords)]) top_hubs_with_score = util.find_top_n_element_after_sorting(hubs.items(), 1, True, config.HOW_MANY_TOP_EXPERTS) top_auth_with_score = util.find_top_n_element_after_sorting(authority_values.items(), 1, True, config.HOW_MANY_TOP_EXPERTS) top_keyword_overlap_with_score = util.find_top_n_element_after_sorting(overlap_word_number, 1, True, config.HOW_MANY_TOP_EXPERTS) print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " HUBS\n", top_hubs_with_score print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " AUTH\n", top_auth_with_score print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " KEYWORD OVERLAP\n", top_keyword_overlap_with_score top_hub = [hub_tuple[0] for hub_tuple in top_hubs_with_score] top_auth = [auth_tuple[0] for auth_tuple in top_auth_with_score] top_keyword_overlap = [key_overlap_tuple[0] for key_overlap_tuple in top_keyword_overlap_with_score] for node_name in message_graph: # mark EXPERTS message_graph.node[node_name]['style'] = 'filled' if node_name in top_auth and node_name in top_keyword_overlap: message_graph.node[node_name]['color'] = '#ff000' elif node_name in top_auth: message_graph.node[node_name]['color'] = '#00ff00' elif node_name in top_keyword_overlap: message_graph.node[node_name]['color'] = '#0000ff' else: message_graph.node[node_name]['color'] = '#cccccc' # mark HUBS if node_name in top_hub: message_graph.node[node_name]['shape'] = 'square' return message_graph, top_hub, top_keyword_overlap, top_auth