import sys, gzip, Queue def int_of_ip(ip): r = 0 for x in ip.split('.'): r *= 256 r += int(x) return(r) def read_tree(filename): f = gzip.open(filename) t = {} for l in f: [son,father] = l.split() t[son] = father return(t) def read_ip_tree(filename): f = gzip.open(filename) t = {} for l in f: [son,father] = l.split() if son[0].isdigit() and father[0].isdigit(): t[int_of_ip(son)] = int_of_ip(father) return(t) def connected_components(g): visited = {} for v in g.iterkeys(): visited[v] = 0 n_cc = 0 q = Queue.Queue() for v in g.iterkeys(): if visited[v] == 0: n_cc += 1 q.put(v) visited[v] = n_cc while not q.empty(): v = q.get() for u in g[v]: if visited[u] == 0: visited[u] = n_cc q.put(u) cc_size_nodes = [0 for i in xrange(n_cc)] for v in g.iterkeys(): cc_size_nodes[visited[v]-1] += 1 cc_size_links = [0 for i in xrange(n_cc)] for v in g.iterkeys(): for u in g[v]: cc_size_links[visited[v]-1] += 1 for i in xrange(n_cc): cc_size_links[i] /= 2 if n_cc>0: size_largest_cc = max(cc_size_nodes) else: size_largest_cc = None return(n_cc,size_largest_cc,cc_size_nodes,cc_size_links) def distance(g,x,y): res = {} if x not in g or y not in g: return(-2) for v in g.iterkeys(): res[v] = -1 q = Queue.Queue() q.put(x) res[x] = 0 while not q.empty(): v = q.get() for u in g[v]: if res[u] == -1: res[u] = res[v] + 1 if u == y: return(res[u]) q.put(u) return(res[y]) def output_cc(f,i,(n_cc,size_largest_cc,cc_size_nodes,cc_size_links)): for x in xrange(n_cc): n = cc_size_nodes[x] m = cc_size_links[x] avg_deg = 0 if n>0: avg_deg = (2.*m)/n density = 0 if n>1: density = (2.*m)/(n*(n-1.)) f.write("%d %d %d %d %d %f %f\n"%(i,n_cc,size_largest_cc,n,m,avg_deg,density)) f.flush() def make_subgraph(nodes,links): g = {} for x in nodes: g[x] = set() for x,y in links: if x in nodes and y in nodes: g[x].add(y) g[y].add(x) return(g) def make_subgraph_d1(nodes,links): g = {} for x in nodes: g[x] = set() for x,y in links: if x in nodes or y in nodes: g[x] = set() g[y] = set() for x,y in links: if x in nodes or y in nodes: g[x].add(y) g[y].add(x) return(g) if __name__ == '__main__': if len(sys.argv) != 5: sys.stderr.write("Usage: %s input_dir begin end out_dir\n"%sys.argv[0]) sys.exit(1) input_dir = sys.argv[1]+"/" begin = int(sys.argv[2]) end = int(sys.argv[3]) out_dir = sys.argv[4]+'/' intervals = set() l = [1,2,3,5,10,50,100] for pre in l: for cur in l: intervals.add((pre,cur)) intervals = set() intervals.add((5,1)) intervals.add((10,1)) intervals.add((50,1)) intervals.add((5,2)) intervals.add((10,2)) intervals.add((50,2)) intervals.add((5,5)) intervals.add((10,5)) intervals.add((50,5)) pre_set = set([x for (x,y) in intervals]) cur_set = set([y for (x,y) in intervals]) max_pre = max(pre_set) max_cur = max(cur_set) sys.stderr.write("max_pre = %d max_cur = %d\n"%(max_pre,max_cur)) sys.stderr.flush() # we will keep max_pre+max_cur steps in memory nodes = [set() for i in xrange(max_pre+max_cur)] links = [set() for i in xrange(max_pre+max_cur)] directed_links = [set() for i in xrange(max_pre+max_cur)] all_nodes = set() all_links = set() # we will create two files for each (x,y) in intervals out_file = {} out_dist_file = {} out_cc_file_Up_Uc = {} out_cc_file_Uc_Up = {} out_cc_file_Ic_Up = {} out_cc_file_Ip_Uc = {} out_cc_file_Up_Uc_d1 = {} out_cc_file_Uc_Up_d1 = {} out_cc_file_Ic_Up_d1 = {} out_cc_file_Ip_Uc_d1 = {} for (pre,cur) in intervals: out = gzip.open(out_dir+"out_"+str(pre)+"_"+str(cur)+".gz","w") out_dist = gzip.open(out_dir+"out_dist_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Up_Uc = gzip.open(out_dir+"cc_Up_Uc_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Uc_Up = gzip.open(out_dir+"cc_Uc_Up_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Ic_Up = gzip.open(out_dir+"cc_Ic_Up_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Ip_Uc = gzip.open(out_dir+"cc_Ip_Uc_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Up_Uc_d1 = gzip.open(out_dir+"cc_Up_Uc_d1_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Uc_Up_d1 = gzip.open(out_dir+"cc_Uc_Up_d1_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Ic_Up_d1 = gzip.open(out_dir+"cc_Ic_Up_d1_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Ip_Uc_d1 = gzip.open(out_dir+"cc_Ip_Uc_d1_"+str(pre)+"_"+str(cur)+".gz","w") out.write("# 1: i = index of curent round\n") out.write("# 2: total nb nodes since beginning\n") out.write("# 3: total nb links since beginning\n") out.write("# 4: nb nodes in round i\n") out.write("# 5: nb links in round i\n") out.write("#\n") out.write("# 6: number of nodes in the union (Up) pre=%d previous rounds (from i-pre-1 to i-1)\n"%pre) out.write("# 7: ---------------------------- (Uc) cur=%d current ------ (from i to i+cur-1)\n"%cur) out.write("# 8: ---------------------- intersection (Ip) pre=%d previous rounds (from i-pre-1 to i-1)\n"%pre) out.write("# 9: ----------------------------------- (Ic) cur=%d current ------ (from i to i+cur-1)\n"%cur) out.write("#\n") out.write("#10: number of appearing nodes (ie in Uc and not in Up)\n") out.write("#11: number of disappearing nodes (ie in Up and not in Uc)\n") out.write("#12: number of appearing stable nodes (ie in Ic and not in Up)\n") out.write("#13: number of disappearing stable nodes (ie in Ip and not in Uc)\n") out.write("#\n") out.write("#14-21: same as 6-13 but with links\n") out.write("#\n") out.write("#22-: border stats\n") out.write("#\n") out.flush() out_dist.write("# each line: step number, number of appearing linkes, distances between extremities of one of them\n") out_dist.flush() for f in [out_cc_Up_Uc,out_cc_Uc_Up,out_cc_Ic_Up,out_cc_Ip_Uc,out_cc_Up_Uc_d1,out_cc_Uc_Up_d1,out_cc_Ic_Up_d1,out_cc_Ip_Uc_d1]: f.write("# 1: i = index of curent round\n") f.write("# 2: number of connected components\n") f.write("# 3: size of the largest one\n") f.write("# 4-...: size of each connected component\n") f.flush() out_file[pre,cur] = out out_dist_file[pre,cur] = out_dist out_cc_file_Up_Uc[pre,cur] = out_cc_Up_Uc out_cc_file_Uc_Up[pre,cur] = out_cc_Uc_Up out_cc_file_Ic_Up[pre,cur] = out_cc_Ic_Up out_cc_file_Ip_Uc[pre,cur] = out_cc_Ip_Uc out_cc_file_Up_Uc_d1[pre,cur] = out_cc_Up_Uc_d1 out_cc_file_Uc_Up_d1[pre,cur] = out_cc_Uc_Up_d1 out_cc_file_Ic_Up_d1[pre,cur] = out_cc_Ic_Up_d1 out_cc_file_Ip_Uc_d1[pre,cur] = out_cc_Ip_Uc_d1 step = begin i = 0 while step < end: sys.stderr.write(str(step)+" ") sys.stderr.flush() t = read_ip_tree(input_dir+"%d.out.gz"%step) nodes[i%(max_cur+max_pre)] = set(t.keys()) links[i%(max_cur+max_pre)] = set() directed_links[i%(max_cur+max_pre)] = set() for x in nodes[i%(max_cur+max_pre)]: if t[x] in nodes[i%(max_cur+max_pre)]: links[i%(max_cur+max_pre)].add((x,t[x])) links[i%(max_cur+max_pre)].add((t[x],x)) directed_links[i%(max_cur+max_pre)].add((t[x],x)) all_nodes.update(nodes[(i-max_cur+1)%(max_cur+max_pre)]) all_links.update(links[(i-max_cur+1)%(max_cur+max_pre)]) # compute the unions and intersections for each _current_ slice size k tmp_union_nodes = set() tmp_inter_nodes = nodes[(i-max_cur+1)%(max_cur+max_pre)].copy() tmp_union_links = set() tmp_inter_links = links[(i-max_cur+1)%(max_cur+max_pre)].copy() tmp_union_directed_links = set() tmp_inter_directed_links = directed_links[(i-max_cur+1)%(max_cur+max_pre)].copy() union_cur_nodes = {} inter_cur_nodes = {} union_cur_links = {} inter_cur_links = {} union_cur_directed_links = {} inter_cur_directed_links = {} for k in xrange(max_cur): tmp_union_nodes.update(nodes[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_inter_nodes.intersection_update(nodes[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_union_links.update(links[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_inter_links.intersection_update(links[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_union_directed_links.update(directed_links[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_inter_directed_links.intersection_update(directed_links[(i-max_cur+k+1)%(max_cur+max_pre)]) if k+1 in cur_set: union_cur_nodes[k+1] = tmp_union_nodes.copy() inter_cur_nodes[k+1] = tmp_inter_nodes.copy() union_cur_links[k+1] = tmp_union_links.copy() inter_cur_links[k+1] = tmp_inter_links.copy() union_cur_directed_links[k+1] = tmp_union_directed_links.copy() inter_cur_directed_links[k+1] = tmp_inter_directed_links.copy() # compute the unions and intersections for each _previous_ slice size k tmp_union_nodes = set() tmp_inter_nodes = nodes[(i-max_cur)%(max_cur+max_pre)].copy() tmp_union_links = set() tmp_inter_links = links[(i-max_cur)%(max_cur+max_pre)].copy() tmp_union_directed_links = set() tmp_inter_directed_links = directed_links[(i-max_cur)%(max_cur+max_pre)].copy() union_pre_nodes = {} inter_pre_nodes = {} union_pre_links = {} inter_pre_links = {} union_pre_directed_links = {} inter_pre_directed_links = {} for k in xrange(max_pre): tmp_union_nodes.update(nodes[(i-max_cur-k)%(max_cur+max_pre)]) tmp_inter_nodes.intersection_update(nodes[(i-max_cur-k)%(max_cur+max_pre)]) tmp_union_links.update(links[(i-max_cur-k)%(max_cur+max_pre)]) tmp_inter_links.intersection_update(links[(i-max_cur-k)%(max_cur+max_pre)]) tmp_union_directed_links.update(directed_links[(i-max_cur-k)%(max_cur+max_pre)]) tmp_inter_directed_links.intersection_update(directed_links[(i-max_cur-k)%(max_cur+max_pre)]) if k+1 in pre_set: union_pre_nodes[k+1] = tmp_union_nodes.copy() inter_pre_nodes[k+1] = tmp_inter_nodes.copy() union_pre_links[k+1] = tmp_union_links.copy() inter_pre_links[k+1] = tmp_inter_links.copy() union_pre_directed_links[k+1] = tmp_union_directed_links.copy() inter_pre_directed_links[k+1] = tmp_inter_directed_links.copy() # compute the statistics for each given interval for (pre,cur) in intervals: out = out_file[pre,cur] out_dist = out_dist_file[pre,cur] # basics out.write("%d %d %d %d %d"%(step-max_cur+1,len(all_nodes),len(all_links)/2,len(nodes[(i-max_cur+1)%(max_cur+max_pre)]),len(links[(i-max_cur+1)%(max_cur+max_pre)])/2)) # node dynamics out.write(" %d %d %d %d"%(len(union_pre_nodes[pre]),len(union_cur_nodes[cur]),len(inter_pre_nodes[pre]),len(inter_cur_nodes[cur]))) appearing_nodes = union_cur_nodes[cur].difference(union_pre_nodes[pre]) disappearing_nodes = union_pre_nodes[pre].difference(union_cur_nodes[cur]) appearing_stable_nodes = inter_cur_nodes[cur].difference(union_pre_nodes[pre]) disappearing_stable_nodes = inter_pre_nodes[pre].difference(union_cur_nodes[cur]) out.write(" %d %d %d %d"%(len(appearing_nodes),len(disappearing_nodes),len(appearing_stable_nodes),len(disappearing_stable_nodes))) # link dynamics out.write(" %d %d %d %d"%(len(union_pre_links[pre]),len(union_cur_links[cur]),len(inter_pre_links[pre]),len(inter_cur_links[cur]))) appearing_links = union_cur_links[cur].difference(union_pre_links[pre]) disappearing_links = union_pre_links[pre].difference(union_cur_links[cur]) appearing_stable_links = inter_cur_links[cur].difference(union_pre_links[pre]) disappearing_stable_links = inter_pre_links[pre].difference(union_cur_links[cur]) out.write(" %d %d %d %d"%(len(appearing_links),len(disappearing_links),len(appearing_stable_links),len(disappearing_stable_links))) directed_appearing_links = union_cur_directed_links[cur].difference(union_pre_directed_links[pre]) directed_disappearing_links = union_pre_directed_links[pre].difference(union_cur_directed_links[cur]) # border nodes (/links ?) for (tmp_links,tmp_nodes) in [(appearing_links,appearing_nodes), (disappearing_links,disappearing_nodes), (directed_appearing_links,appearing_nodes), (directed_disappearing_links,disappearing_nodes)]: out_border_nodes = set() in_border_nodes = set() for (x,y) in tmp_links: if x not in tmp_nodes and y in tmp_nodes: out_border_nodes.add(x) in_border_nodes.add(y) out.write(" %d %d"%(len(out_border_nodes),len(in_border_nodes))) out.write("\n") out.flush() # distance between extremities of appearing links g = make_subgraph(union_pre_nodes[pre],union_pre_links[pre]) for (u,v) in directed_appearing_links: out_dist.write("%d %d %d\n"%(step,len(directed_appearing_links),distance(g,u,v))) out_dist.flush() # connected components for Up_Uc for file,sub in [(out_cc_file_Up_Uc,make_subgraph), (out_cc_file_Up_Uc_d1,make_subgraph_d1)]: output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(disappearing_nodes,disappearing_links))) # connected components for Uc_Up for file,sub in [(out_cc_file_Uc_Up,make_subgraph), (out_cc_file_Uc_Up_d1,make_subgraph_d1)]: output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(appearing_nodes,appearing_links))) # connected components for Ic_Up for file,sub in [(out_cc_file_Ic_Up,make_subgraph), (out_cc_file_Ic_Up_d1,make_subgraph_d1)]: output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(appearing_stable_nodes,appearing_links))) # connected components for Ip_Uc for file,sub in [(out_cc_file_Ip_Uc,make_subgraph), (out_cc_file_Ip_Uc_d1,make_subgraph_d1)]: output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(disappearing_stable_nodes,disappearing_links))) i += 1 sys.stdout.write("%d "%i) sys.stdout.flush() step += 1 sys.stderr.write("\n") sys.stderr.flush()