import sys
from os import path
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
import re
import networkx as nx
from networkx.algorithms.components.connected import connected_components
from datetime import date
import lib.util as util
import csv
import numpy as np
import lib.config as config
from itertools import izip_longest as zip_longest
import lib.analysis.user as user
[docs]def message_number_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False):
"""
Creates a directed graph
with each node representing an IRC user
and each directed edge has a weight which
mentions the number messages sent and recieved by that user
in the selected time frame.
Args:
log_dict (dict): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name}
nicks(list): list of all the nicks
nick_same_list(list): list of lists mentioning nicks which belong to same users
Returns:
message_number_graph (nx graph object)
"""
message_number_day_list = []
conversations=[[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
aggregate_message_number_graph = nx.DiGraph() #graph with multiple directed edges between clients used
G = util.to_graph(nick_same_list)
conn_comp_list = list(connected_components(G))
conn_comp_list = util.create_connected_nick_list(conn_comp_list)
def msg_no_analysis_helper(rec_list, corrected_nick, nick, conn_comp_list,conversations,today_conversation):
for receiver in rec_list:
if(receiver == nick):
if(corrected_nick != nick):
nick_receiver = ''
nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick, conn_comp_list, nick_receiver)
if DAY_BY_DAY_ANALYSIS:
today_conversation = util.extend_conversation_list(nick_sender, nick_receiver, today_conversation)
else:
conversations = util.extend_conversation_list(nick_sender, nick_receiver, conversations)
def message_no_add_egde(message_graph, conversation):
for index in xrange(config.MAX_EXPECTED_DIFF_NICKS):
if(len(conversation[index]) == 3 and conversation[index][0] >= config.THRESHOLD_MESSAGE_NUMBER_GRAPH):
if len(conversation[index][1]) >= config.MINIMUM_NICK_LENGTH and len(conversation[index][2]) >= config.MINIMUM_NICK_LENGTH:
message_graph.add_edge(util.get_nick_representative(nicks, nick_same_list, conversation[index][1]), util.get_nick_representative(nicks, nick_same_list, conversation[index][2]), weight=conversation[index][0])
return message_graph
for day_content_all_channels in log_dict.values():
for day_content in day_content_all_channels:
day_log = day_content["log_data"]
today_conversation = [[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
for line in day_log:
flag_comma = 0
if(util.check_if_msg_line (line)):
parsed_nick = re.search(r"\<(.*?)\>", line)
corrected_nick = util.correctLastCharCR(parsed_nick.group(0)[1:-1])
nick_sender = ""
nick_receiver = ""
nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, corrected_nick, conn_comp_list, nick_sender)
for nick in nicks:
rec_list = [e.strip() for e in line.split(':')]
rec_list = util.rec_list_splice(rec_list)
if not rec_list[1]:
break
rec_list = util.correct_last_char_list(rec_list)
msg_no_analysis_helper(rec_list, corrected_nick, nick, conn_comp_list, conversations,today_conversation)
if "," in rec_list[1]:
flag_comma = 1
rec_list_2=[e.strip() for e in rec_list[1].split(',')]
for i in xrange(0,len(rec_list_2)):
if(rec_list_2[i]):
rec_list_2[i] = util.correctLastCharCR(rec_list_2[i])
msg_no_analysis_helper(rec_list_2, corrected_nick, nick, conn_comp_list, conversations, today_conversation)
if(flag_comma == 0):
rec = line[line.find(">")+1:line.find(", ")]
rec = rec[1:]
rec = util.correctLastCharCR(rec)
if(rec == nick):
if(corrected_nick != nick):
nick_receiver = nick_receiver_from_conn_comp(nick, conn_comp_list)
if DAY_BY_DAY_ANALYSIS:
today_message_number_graph = nx.DiGraph()
today_message_number_graph = message_no_add_egde(today_message_number_graph, today_conversation)
year, month, day = util.get_year_month_day(day_content)
message_number_day_list.append([today_message_number_graph, year+'-'+month+'-'+day])
print "\nBuilding graph object with EDGE WEIGHT THRESHOLD:", config.THRESHOLD_MESSAGE_NUMBER_GRAPH
if not DAY_BY_DAY_ANALYSIS:
aggregate_message_number_graph = message_no_add_egde(aggregate_message_number_graph, conversations)
if config.DEBUGGER:
print "========> 30 on " + str(len(conversations)) + " conversations"
print conversations[:30]
if DAY_BY_DAY_ANALYSIS:
return message_number_day_list
else:
return aggregate_message_number_graph
[docs]def channel_user_presence_graph_and_csv(nicks, nick_same_list, channels_for_user, nick_channel_dict, nicks_hash, channels_hash):
""" creates a directed graph for each nick,
each edge from which points to the IRC Channels that nick has participated in.
(Nick changes are tracked here and only the initial nick is shown if a user changed his nick)
Args:
nicks(list): list of all the nicks
nick_same_list(list): list of lists mentioning nicks which belong to same users
channels_for_user(dict): dictionary with keys as nicks and value as list of
channels on which user with nick is present
nick_channel_dict(dict): channels and nicks present on them
nicks_hash(list): hash values of nicks
channels_hash(list): hash values of channels
Returns:
presence_graph_and_matrix (dict): contains adjacency matrices and graphs for Acc Auu Acu
full_presence_graph (nx graph object)
"""
presence_graph_and_matrix = {
"CC": {
"graph": None,
"matrix": None,
"reducedMatrix": None,
"reducedGraph": None
},
"CU": {
"graph": None,
"matrix": None,
"reducedMatrix": None,
"reducedGraph": None
},
"UU": {
"graph": None,
"matrix": None,
"reducedMatrix": None,
"reducedGraph": None
},
}
users_on_channel = {}
full_presence_graph = nx.DiGraph() #directed
def create_adj_matrix(hashed_list1, hashed_list2):
adj_matrix = [[0] * len(hashed_list1) for i in range(len(hashed_list2))]
return adj_matrix
def add_channel_weighted_edge(graph, adjlist, nicks_hash, channels_hash, channel):
graph.add_edge(nicks_hash.index(adjlist['nickname']), (config.STARTING_HASH_CHANNEL + channels_hash.index(channel[0])), weight=channel[1])
return graph
#====================== CHANNEL_USER ============================
channel_user_graph = nx.Graph()
CU_adjacency_matrix = create_adj_matrix(nicks_hash, channels_hash)
for adjlist in nick_channel_dict:
for channel in adjlist['channels']:
if channel[1] > config.FILTER_FOR_CHANNEL_USER_GRAPH:
# print str(nicks_hash.index(adjlist['nickname']))+"\t"+str(config.STARTING_HASH_CHANNEL +channels_hash.index(channel[0]))
channel_user_graph = add_channel_weighted_edge(channel_user_graph, adjlist, nicks_hash, channels_hash, channel)
full_presence_graph = add_channel_weighted_edge(full_presence_graph, adjlist, nicks_hash, channels_hash, channel)
# print nicks_hash.index(adjlist['nickname']),adjlist['nickname'], channels_hash.index(channel[0]),channel[0], channel[1]
CU_adjacency_matrix[channels_hash.index(channel[0])][nicks_hash.index(adjlist['nickname'])] = channel[1]
# print adjlist['nickname'], channel[0]
if users_on_channel.has_key(channel[0]):
if adjlist['nickname'] not in users_on_channel[channel[0]]:
users_on_channel[channel[0]].append(adjlist['nickname'])
else:
users_on_channel[channel[0]] = [adjlist['nickname']]
presence_graph_and_matrix["CU"]["matrix"] = CU_adjacency_matrix
presence_graph_and_matrix["CU"]["graph"] = channel_user_graph
print "CU Adjacency Matrix Generated"
#====================== CHANNEL_CHANNEL ============================
channel_channel_graph = nx.Graph()
CC_adjacency_matrix = create_adj_matrix(channels_hash, channels_hash)
def add_common_users_weighted_edge(graph, index1, index2, common_users):
graph.add_edge(str(config.STARTING_HASH_CHANNEL + index1), str(config.STARTING_HASH_CHANNEL + index2), weight=len(common_users))
return graph
for i in xrange(0, len(channels_hash)):
for j in xrange(i+1, len(channels_hash)):
common_users = list(set(users_on_channel[channels_hash[i]]) & set(users_on_channel[channels_hash[j]]))
# print users_on_channel.keys()[i], users_on_channel.keys()[j], common_users
CC_adjacency_matrix[i][j] = len(common_users)
CC_adjacency_matrix[j][i] = len(common_users)
if len(common_users) > config.FILTER_FOR_CHANNEL_CHANNEL_GRAPH:
full_presence_graph = add_common_users_weighted_edge(full_presence_graph, i, j, common_users)
full_presence_graph = add_common_users_weighted_edge(full_presence_graph, j, i, common_users)
'''full_presence_graph.add_edge(str(config.STARTING_HASH_CHANNEL + i), str(config.STARTING_HASH_CHANNEL + j), weight=len(common_users))
full_presence_graph.add_edge(str(config.STARTING_HASH_CHANNEL + j), str(config.STARTING_HASH_CHANNEL + i), weight=len(common_users))'''
# "Uncomment for directed version"
# print str(channels_hash.index(users_on_channel.keys()[i]))+"\t"+str(channels_hash.index(users_on_channel.keys()[j]))
# print str(channels_hash.index(users_on_channel.keys()[j]))+"\t"+str(channels_hash.index(users_on_channel.keys()[i]))
# print channels_hash[i],channels_hash[j]
channel_channel_graph = add_common_users_weighted_edge(channel_channel_graph, i , j, common_users)
'''channel_channel_graph.add_edge(str(config.STARTING_HASH_CHANNEL + i), str(config.STARTING_HASH_CHANNEL + j), weight=len(common_users))'''
presence_graph_and_matrix["CC"]["matrix"] = CC_adjacency_matrix
presence_graph_and_matrix["CC"]["graph"] = channel_channel_graph
print "CC Adjacency Matrix Generated"
#====================== USER_USER ============================
user_user_graph = nx.Graph()
UU_adjacency_matrix = create_adj_matrix(nicks_hash, nicks_hash)
for user_channel_dict in channels_for_user:
# user_channel_dict format : {nick : [channels on that day], }
for i in xrange(0, len(user_channel_dict.keys())):
for j in xrange(i+1, len(user_channel_dict.keys())):
common_channels_on_that_day = list(set(user_channel_dict[user_channel_dict.keys()[i]]) & set(user_channel_dict[user_channel_dict.keys()[j]]))
# print user_channel_dict.keys()[i], user_channel_dict.keys()[j], common_channels_on_that_day
user1 = user_channel_dict.keys()[i]
user2 = user_channel_dict.keys()[j]
no_of_common_channels_day = len(common_channels_on_that_day)
#print str(nicks_hash.index(user1))+"\t"+str(nicks_hash.index(user2))
# "Uncomment for directed version"
# print str(nicks_hash.index(user2))+"\t"+str(nicks_hash.index(user1))
# print user1, user2
UU_adjacency_matrix[nicks_hash.index(user1)][nicks_hash.index(user2)] += no_of_common_channels_day
UU_adjacency_matrix[nicks_hash.index(user2)][nicks_hash.index(user1)] += no_of_common_channels_day
for i in range(len(nicks_hash)):
for j in range(i):
if UU_adjacency_matrix[i][j] > config.FILTER_FOR_USER_USER_GRAPH:
# print str(i)+'\t'+str(j)
user_user_graph.add_edge(i, j, weight=UU_adjacency_matrix[i][j])
full_presence_graph.add_edge(i, j, weight=UU_adjacency_matrix[i][j])
full_presence_graph.add_edge(j, i, weight=UU_adjacency_matrix[i][j])
presence_graph_and_matrix["UU"]["matrix"] = UU_adjacency_matrix
presence_graph_and_matrix["UU"]["graph"] = user_user_graph
print "UU Adjacency Matrix Generated"
def print_node_degree(nodes, max_degree_possible):
for i in range(max_degree_possible):
print "deg"+str(i)+'\t'+str(nodes[i])
degree_map = {"out": full_presence_graph.out_degree().values(), "in": full_presence_graph.in_degree().values(), "all": full_presence_graph.degree().values()}
def inc_degree(degree_list, nodes, max_degree_possible):
for degree in degree_list:
if not degree < max_degree_possible:
print "===error", degree
else:
nodes[degree] += 1
return nodes
#=========================================================================
if config.GENERATE_DEGREE_ANAL:
max_degree_possible = config.CHANNEL_USER_MAX_DEG
nodes_with_OUT_degree = [0]*max_degree_possible
nodes_with_IN_degree = [0]*max_degree_possible
nodes_with_TOTAL_degree = [0]*max_degree_possible
# print full_presence_graph.out_degree().values()
nodes_with_OUT_degree = inc_degree(degree_map["out"], nodes_with_OUT_degree, max_degree_possible)
nodes_with_IN_degree = inc_degree(degree_map["in"], nodes_with_IN_degree, max_degree_possible)
nodes_with_TOTAL_degree = inc_degree(degree_map["all"], nodes_with_TOTAL_degree, max_degree_possible)
print "========= OUT DEGREE ======="
print_node_degree(nodes_with_OUT_degree, max_degree_possible)
print "========= IN DEGREE ======="
print_node_degree(nodes_with_IN_degree, max_degree_possible)
print "========= TOTAL DEGREE ======="
print_node_degree(nodes_with_TOTAL_degree, max_degree_possible)
#=========================================================================
'''
We have around 20k users and most of them just visit a channel once,
hence we filter out the top 100 users
This inturns reduces the CU and UU matrices
'''
'''
calculate top <how_many_top_users> users
this is achieved by taking top users from CU matrix on the basis of the column sum (total number of days active on a channel)
# '''
how_many_top_users = config.FILTER_TOP_USERS
'''
we also need to filter the channels and are filtered on the basis of row sum of CC matrix
'''
how_many_top_channels = config.FILTER_TOP_CHANNELS
sum_for_each_channel = []
for channel_row in CC_adjacency_matrix:
sum_for_each_channel.append(sum(channel_row))
def get_top_indices(sum_list, how_many_vals):
return sorted(range(len(sum_list)), key=lambda i: sum_list[i], reverse=True)[:how_many_vals]
def get_indices_to_delete(hash_list, top_indices):
return list(set([i for i in range(len(hash_list))]) - set(top_indices))
#filter out top <how_many_top_channels> indices
top_indices_channels = get_top_indices(sum_for_each_channel, how_many_top_channels)
indices_to_delete_channels = get_indices_to_delete(channels_hash, top_indices_channels)
temp_channels = np.delete(CC_adjacency_matrix, indices_to_delete_channels, 1) #delete columns
reduced_CC_adjacency_matrix = np.delete(temp_channels, indices_to_delete_channels, 0) #delete rows
presence_graph_and_matrix["CC"]["reducedMatrix"] = reduced_CC_adjacency_matrix
reduced_CC_graph = channel_channel_graph.copy()
reduced_CC_graph.remove_nodes_from(map(str, np.array(indices_to_delete_channels) + config.STARTING_HASH_CHANNEL)) # say the indices to remove are 1,2 presence_graph_and_matrix["CC"]["reducedGraph"] = reduced_CC_graph
presence_graph_and_matrix["CC"]["reducedGraph"] = reduced_CC_graph
print "Generated Reduced CC Adjacency Matrix"
#to calculate sum first take the transpose of CU matrix so users in row
UC_adjacency_matrix = zip(*CU_adjacency_matrix)
sum_for_each_user = []
for user_row in UC_adjacency_matrix:
sum_for_each_user.append(sum(user_row))
#filter out top <how_many_top_users> indices
top_indices_users = get_top_indices(sum_for_each_user, how_many_top_users)
indices_to_delete_users = get_indices_to_delete(nicks_hash, top_indices_users)
# print len(top_indices_users), top_indices_users
# print len(indices_to_delete_users), indices_to_delete_users
#update the nick_hash, channel_hash
reduced_nick_hash = np.delete(nicks_hash, indices_to_delete_users)
reduced_channel_hash = np.delete(channels_hash, indices_to_delete_channels)
#update the CU matrix by deleting particular columns, and rows which are not in top_indices_users, channels
temp_user_channel = np.delete(CU_adjacency_matrix, indices_to_delete_users, 1) #delete columns
reduced_CU_adjacency_matrix = np.delete(temp_user_channel, indices_to_delete_channels, 0) #delete rows
reduced_CU_graph = channel_user_graph.copy()
reduced_CU_graph.remove_nodes_from(np.array(indices_to_delete_channels) + config.STARTING_HASH_CHANNEL) #remove non top channels_
reduced_CU_graph.remove_nodes_from(np.array(indices_to_delete_users)) #remove users
presence_graph_and_matrix["CU"]["reducedGraph"] = reduced_CU_graph
print "Generated Reduced CU Adjacency Matrix"
presence_graph_and_matrix["CU"]["reducedMatrix"] = reduced_CU_adjacency_matrix
#update the UU matrix by deleting both columns and rows
temp_users = np.delete(UU_adjacency_matrix, indices_to_delete_users, 1) #delete columns
reduced_UU_adjacency_matrix = np.delete(temp_users, indices_to_delete_users, 0) #delete rows
reduced_UU_graph = user_user_graph.copy()
reduced_UU_graph.remove_nodes_from(np.array(indices_to_delete_users))
presence_graph_and_matrix["UU"]["reducedGraph"] = reduced_UU_graph
print "Generated Reduced UU Adjacency Matrix"
presence_graph_and_matrix["UU"]["reducedMatrix"] = reduced_UU_adjacency_matrix
if config.PRINT_CHANNEL_USER_HASH:
print "=================================================="
print "========= REDUCED NICK HASH ========="
for i in range(len(reduced_nick_hash)):
print str(i)+"\t"+reduced_nick_hash[i]
print "========= REDUCED CHANNEL HASH ========="
for i in range(len(reduced_channel_hash)):
print str(config.STARTING_HASH_CHANNEL + i)+"\t"+reduced_channel_hash[i]
return presence_graph_and_matrix, full_presence_graph
[docs]def filter_edge_list(edgelist_file_loc, max_hash, how_many_top):
"""
reduces the edge list by selecting top nodes through degree analysis
Arguments:
edgelist_file_loc (str): location of the edgelist file
max_hash (int): max possinle value of the node_hash in edgelist
how_many_top (int): how many top nodes to select in the new edgeList
Returns:
null
"""
node_list = []
degrees = [0] * max_hash
with open(edgelist_file_loc) as f:
content = f.readlines()
for line in content:
a, b = line.split()
node_list.append(int(a))
node_list.append(int(b))
degrees[int(a)] += 1
degrees[int(b)] += 1
print "Done Pre Computation"
print "Max_hash", max(node_list)
max_hash = max(node_list)
degrees = np.array(degrees)
print "========TOP "+str(how_many_top)+" NODES ON BASIS OF DEGREE ========"
top_nodes = list(degrees.argsort()[::-1])[:how_many_top]
# print top_nodes
print "======= UPDATED ADJACENY LIST ON THE BASIS OF ABOVE NODES ======="
with open(edgelist_file_loc) as f:
content = f.readlines()
for line in content:
a, b = map(int, line.split())
if a in top_nodes and b in top_nodes:
print str(a) + "\t" + str(b)
def degree_analysis_on_graph(nx_graph, date=None, directed=True):
    """Perform degree analysis of the input graph object.

    Args:
        nx_graph (nx object): graph to perform analysis on.
        date (string): timestamp label used as the first cell of raw rows.
        directed (boolean): True if nx_graph is directed, else False.

    Returns:
        dict: with "out_degree", "in_degree" & "total_degree" keys for
        directed graphs, and a single "degree" key for undirected graphs.
        Each value holds "formatted_for_csv" rows and a "raw_for_vis" list.
    """
    def nodes_with_degree_populator(degree_values, label):
        # One row [label+degree, count, csv-of-users] per possible degree
        # value from 0..max; a single "NA" row when the graph has no nodes.
        nodes_with_degree = []
        if len(degree_values):
            nodes_with_degree = [[label + str(i), 0, ''] for i in xrange((max(degree_values) + 1))]
        else:
            nodes_with_degree = [["NA", 0, "NA"]]
        for degree in degree_values:
            nodes_with_degree[degree][1] += 1
        return nodes_with_degree

    def give_userlist_where_degree_helper(degree_dict, degree):
        # comma-separated string of all nodes whose degree equals `degree`
        key_list = ""
        for key in degree_dict:
            if degree_dict[key] == degree:
                key_list += (str(key) + ", ")
        return key_list

    degree_map = {}  # will map a string (eg "out", "in", "all") to nx_graph.out_degree() etc.

    def raw_node_append(nodes, raw, degree_type):
        """Fill raw counts and per-degree user lists.

        Args:
            nodes (List): nodes_with_OUT/IN/TOTAL degree (header at index 0).
            raw (List): raw_in/out/total accumulator.
            degree_type (str): "in", "out" or "all" — keys of degree_map.

        Returns:
            (raw, nodes) tuple.
        """
        for i in range(1, len(nodes)):
            # raw stores the number of nodes with degree 0 in position 1,
            # nodes with degree 1 in position 2, etc.
            raw.append(nodes[i][1])
            nodes[i][2] = give_userlist_where_degree_helper(degree_map[degree_type], i - 1)
        return raw, nodes

    if directed:
        nodes_with_OUT_degree = nodes_with_degree_populator(nx_graph.out_degree().values(), "nodes_w_out_deg")
        nodes_with_IN_degree = nodes_with_degree_populator(nx_graph.in_degree().values(), "nodes_w_in_deg")
        nodes_with_TOTAL_degree = nodes_with_degree_populator(nx_graph.degree().values(), "nodes_w_deg")
        # header row: total node count = sum over all degree buckets
        nodes_with_OUT_degree.insert(0, ["total_nodes", sum(data[1] for data in nodes_with_OUT_degree)])
        nodes_with_IN_degree.insert(0, ["total_nodes", sum(data[1] for data in nodes_with_IN_degree)])
        nodes_with_TOTAL_degree.insert(0, ["total_nodes", sum(data[1] for data in nodes_with_TOTAL_degree)])
        raw_out = [str(date)]
        raw_in = [str(date)]
        raw_total = [str(date)]
        degree_map = {"out": nx_graph.out_degree(), "in": nx_graph.in_degree(), "all": nx_graph.degree()}
        raw_out, nodes_with_OUT_degree = raw_node_append(nodes_with_OUT_degree, raw_out, "out")
        raw_in, nodes_with_IN_degree = raw_node_append(nodes_with_IN_degree, raw_in, "in")
        raw_total, nodes_with_TOTAL_degree = raw_node_append(nodes_with_TOTAL_degree, raw_total, "all")
        return {
            "out_degree": {
                "formatted_for_csv": nodes_with_OUT_degree,
                "raw_for_vis": raw_out
            },
            "in_degree": {
                "formatted_for_csv": nodes_with_IN_degree,
                "raw_for_vis": raw_in
            },
            "total_degree": {
                "formatted_for_csv": nodes_with_TOTAL_degree,
                "raw_for_vis": raw_total
            }
        }
    # for undirected
    else:
        nodes_with_degree_undirected = nodes_with_degree_populator(nx_graph.degree().values(), "nodes_w_deg")
        nodes_with_degree_undirected.insert(0, ["total_nodes", sum(data[1] for data in nodes_with_degree_undirected)])
        raw_degree = [str(date)]
        degree_map = {"all": nx_graph.degree()}
        raw_degree, nodes_with_degree_undirected = raw_node_append(nodes_with_degree_undirected, raw_degree, "all")
        return {
            "degree": {
                "formatted_for_csv": nodes_with_degree_undirected,
                "raw_for_vis": raw_degree
            }
        }
def message_time_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False):
    """Create directed graphs of messages stamped with the time they were sent.

    Each edge denotes a message sent from a user to another user, with the
    stamp denoting the time at which the message was sent.

    Args:
        log_dict (dictionary): dictionary of logs data created using reader.py.
        nicks (List): list of nicknames created using nickTracker.py.
        nick_same_list (List): list of same-nick groups created using nickTracker.py.
        DAY_BY_DAY_ANALYSIS (bool): True if graphs are produced for each day.

    Returns:
        msg_time_graph_list (List): list of message-time graphs for different
            days (when DAY_BY_DAY_ANALYSIS is True).
        msg_time_aggr_graph: aggregate message-time MultiDiGraph otherwise.
    """
    msg_time_graph_list = []
    msg_time_aggr_graph = nx.MultiDiGraph()

    # Collapse nick aliases into connected components.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))

    def compare_spliced_nick(nick_to_compare, spliced_nick, nick_name, line):
        # If nick_to_compare is the nick being scanned (and not the sender
        # itself), add edges to both the per-day and aggregate graphs.
        # nick_sender / year / month / day / graph_conversation are read from
        # the enclosing scope at call time.
        if(nick_to_compare == nick_name):
            if(spliced_nick != nick_name):
                nick_receiver = nick_receiver_from_conn_comp(nick_name, conn_comp_list)
                util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph)

    conn_comp_list = util.create_connected_nick_list(conn_comp_list)
    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            year, month, day = util.get_year_month_day(day_content)
            graph_conversation = nx.MultiDiGraph()  # graph with multiple directed edges between clients used
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line(line)):
                    m = re.search(r"\<(.*?)\>", line)
                    spliced_nick = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = ""
                    nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, spliced_nick, conn_comp_list, nick_sender)
                    for nick_name in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list split about ':'
                        rec_list = util.rec_list_splice(rec_list)
                        if not rec_list[1]:  # index 0 will contain time like 14:02
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for nick_to_search in rec_list:
                            if(nick_to_search == nick_name):
                                if(spliced_nick != nick_name):
                                    nick_receiver = ""
                                    nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick_name, conn_comp_list, nick_receiver)
                                    util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph)
                        if "," in rec_list[1]:  # receiver list may be of the form "<Dhruv> Rohan, Ram :"
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for nick_to_search in rec_list_2:
                                compare_spliced_nick(nick_to_search, spliced_nick, nick_name, line)
                        if(flag_comma == 0):  # receiver list can be "<Dhruv> Rohan, Hi!"
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = util.correctLastCharCR(rec[1:])
                            compare_spliced_nick(rec, spliced_nick, nick_name, line)
            msg_time_graph_list.append(graph_conversation)

    if DAY_BY_DAY_ANALYSIS:
        return msg_time_graph_list
    else:
        return msg_time_aggr_graph
def message_number_bins_csv(log_dict, nicks, nick_same_list):
    """Track the number of messages exchanged per half-hour bin of the day.

    Builds, for each day, 48 bins of half an hour each (given the default
    config values), aggregated over the whole log period.

    Args:
        log_dict (dictionary): dictionary of logs data created using reader.py.
        nicks (List): list of nicknames created using nickTracker.py.
        nick_same_list (List): list of same-nick groups created using nickTracker.py.

    Returns:
        bin_matrix (list of lists): one list of bins per day with the number
            of messages sent in each bin.
        tot_msgs (int): total messages exchanged.
    """
    # Py2 integer division: 24*60/30 == 48 bins by default.
    no_of_bins = (config.HOURS_PER_DAY * config.MINS_PER_HOUR) / config.BIN_LENGTH_MINS
    tot_msgs = [0] * no_of_bins
    bin_matrix = []

    def bin_increment(nick_name, messager, nick_spliced, bins, bin_index):
        # count the message unless the sender addressed himself
        if(nick_name == messager):
            if(nick_spliced != messager):
                bins[bin_index] = bins[bin_index] + 1

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            bins = [0] * no_of_bins
            for line in day_log:
                if(line[0] == '['):
                    # e.g. 01:35 -> 95 minutes -> 95/30 -> bin index 3
                    time_in_min = int(line[1:3]) * 60 + int(line[4:6])
                    bin_index = time_in_min / config.BIN_LENGTH_MINS
                    # sometimes messages are sent to users like
                    # "[22:55] <Rohan> Krishna, i hope we are on track." and
                    # sometimes like "[22:56] <Krishna> Rohan: Yes it is."
                    flag_comma = 0
                    if(line[0] != '=' and "] <" in line and "> " in line):
                        m = re.search(r"\<(.*?)\>", line)
                        nick_spliced = util.correctLastCharCR(m.group(0)[1:-1])
                        for messager in nicks:
                            rec_list = [e.strip() for e in line.split(':')]
                            rec_list = util.rec_list_splice(rec_list)
                            if not rec_list[1]:
                                break
                            rec_list = util.correct_last_char_list(rec_list)
                            for nick_name in rec_list:
                                bin_increment(nick_name, messager, nick_spliced, bins, bin_index)
                            if "," in rec_list[1]:
                                flag_comma = 1
                                rec_list = [e.strip() for e in rec_list[1].split(',')]
                                rec_list = util.correct_last_char_list(rec_list)
                                for nick_name in rec_list:
                                    bin_increment(nick_name, messager, nick_spliced, bins, bin_index)
                            if(flag_comma == 0):
                                rec = line[line.find(">") + 1:line.find(", ")][1:]
                                rec = util.correctLastCharCR(rec)
                                bin_increment(rec, messager, nick_spliced, bins, bin_index)
            bin_matrix.append(bins)
            tot_msgs = [tot_msgs[i] + bins[i] for i in range(len(bins))]

    return bin_matrix, sum(tot_msgs)
def degree_node_number_csv(log_dict, nicks, nick_same_list):
    """Produce per-day degree distributions formatted for CSV output.

    Runs a day-by-day message-number analysis and collects, for each day, the
    number of nodes having each in-, out- and total-degree.

    Args:
        log_dict (dict): with key as dateTime.date object and value as
            {"data": datalist, "channel_name": channel's name}.
        nicks (list): list of all the nicks.
        nick_same_list (list): list of lists grouping nicks of the same user.

    Returns:
        out_degree (list), in_degree (list), total_degree (list): each a list
        of rows (header row first, days as columns, zero-filled).
    """
    msg_num_graph_day_list = message_number_graph(log_dict, nicks, nick_same_list, True)
    degree_analysis_day_list = []
    for day_graph_list in msg_num_graph_day_list:
        # day_graph_list = [graph, "YYYY-MM-DD"]
        day_graph = day_graph_list[0]
        degree_analysis_day_list.append(degree_analysis_on_graph(day_graph, day_graph_list[1]))

    out_degree = []
    in_degree = []
    total_degree = []
    max_in_degree = 0
    max_out_degree = 0
    max_total_degree = 0
    for degree_analysis in degree_analysis_day_list:
        out_degree.append(degree_analysis["out_degree"]["raw_for_vis"])
        in_degree.append(degree_analysis["in_degree"]["raw_for_vis"])
        total_degree.append(degree_analysis["total_degree"]["raw_for_vis"])
        # track the widest row so headers cover every degree seen
        max_out_degree = max(max_out_degree, len(degree_analysis["out_degree"]["raw_for_vis"]))
        max_in_degree = max(max_in_degree, len(degree_analysis["in_degree"]["raw_for_vis"]))
        max_total_degree = max(max_total_degree, len(degree_analysis["total_degree"]["raw_for_vis"]))

    def format_degree_list(degree_list, max_range, degree_type):
        # prepend a header row, then transpose with zero-fill so each day
        # becomes a column
        degree_head_row = ["deg" + str(i) for i in range(max_range)]
        degree_head_row.insert(0, degree_type)
        degree_list.insert(0, degree_head_row)
        degree_list = list(zip_longest(*degree_list, fillvalue=0))
        return degree_list

    out_degree = format_degree_list(out_degree, max_out_degree, "out_degree")
    in_degree = format_degree_list(in_degree, max_in_degree, "in_degree")
    total_degree = format_degree_list(total_degree, max_total_degree, "total_degree")
    return out_degree, in_degree, total_degree
def nick_receiver_from_conn_comp(nick, conn_comp_list):
    """Resolve a nick to its canonical receiver nick.

    Returns the first nick of the connected component (alias group) that
    contains ``nick``, or "" if no component contains it. Helper used in
    message_time_graph and message_number_graph.

    Args:
        nick (str): nick to resolve.
        conn_comp_list (list): list of alias groups; indexed up to
            config.MAX_EXPECTED_DIFF_NICKS.

    Returns:
        str: canonical nick, or "" when not found.
    """
    for index in range(config.MAX_EXPECTED_DIFF_NICKS):
        if nick in conn_comp_list[index]:
            # first element of the component is the canonical nick
            return conn_comp_list[index][0]
    return ""
[docs]def identify_hubs_and_experts(log_dict, nicks, nick_same_list):
"""
uses message_number graph to identify hubs and experts in the network
Args:
log_dict (dict): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name}
nicks(list): list of all the nicks
nick_same_list(list): list of lists mentioning nicks which belong to same users
Returns:
message_graph(nx graph): message number graph
top_hub(list): list of top hubs
top_keyword_overlap(list): top users from keywords digest
top_auth: list of top authorities
"""
message_graph = message_number_graph(log_dict, nicks, nick_same_list)
hubs, authority_values = nx.hits(message_graph)
keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words, keywords_for_channels = user.keywords(log_dict, nicks, nick_same_list)
if config.DEBUGGER:
print "========> USERS"
print user_keyword_freq_dict
print "========> CHANNELS"
print keywords_for_channels, len(keywords_for_channels)
top_keywords_for_channels = []
for word_tuple in keywords_for_channels[:config.NUMBER_OF_KEYWORDS_CHANNEL_FOR_OVERLAP]:
top_keywords_for_channels.append(word_tuple[0])
overlap_word_number = []
for keyword_tuple in user_keyword_freq_dict:
keywords_for_user = keyword_tuple['keywords']
username = keyword_tuple['nick']
overlapping_keywords = list(set(top_keywords_for_channels).intersection([x[0] for x in keywords_for_user]))
if len(overlapping_keywords) > 0:
overlap_word_number.append([username, len(overlapping_keywords)])
top_hubs_with_score = util.find_top_n_element_after_sorting(hubs.items(), 1, True, config.HOW_MANY_TOP_EXPERTS)
top_auth_with_score = util.find_top_n_element_after_sorting(authority_values.items(), 1, True, config.HOW_MANY_TOP_EXPERTS)
top_keyword_overlap_with_score = util.find_top_n_element_after_sorting(overlap_word_number, 1, True, config.HOW_MANY_TOP_EXPERTS)
print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " HUBS\n", top_hubs_with_score
print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " AUTH\n", top_auth_with_score
print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " KEYWORD OVERLAP\n", top_keyword_overlap_with_score
top_hub = [hub_tuple[0] for hub_tuple in top_hubs_with_score]
top_auth = [auth_tuple[0] for auth_tuple in top_auth_with_score]
top_keyword_overlap = [key_overlap_tuple[0] for key_overlap_tuple in top_keyword_overlap_with_score]
for node_name in message_graph:
# mark EXPERTS
message_graph.node[node_name]['style'] = 'filled'
if node_name in top_auth and node_name in top_keyword_overlap:
message_graph.node[node_name]['color'] = '#ff000'
elif node_name in top_auth:
message_graph.node[node_name]['color'] = '#00ff00'
elif node_name in top_keyword_overlap:
message_graph.node[node_name]['color'] = '#0000ff'
else:
message_graph.node[node_name]['color'] = '#cccccc'
# mark HUBS
if node_name in top_hub:
message_graph.node[node_name]['shape'] = 'square'
return message_graph, top_hub, top_keyword_overlap, top_auth