import sys, gzip, Queue def int_of_ip(ip): r = 0 for x in ip.split('.'): r *= 256 r += int(x) return(r) def read_tree(filename): f = gzip.open(filename) t = {} for l in f: [son,father] = l.split() t[son] = father return(t) def read_ip_tree(filename): f = gzip.open(filename) t = {} t2 = {} for l in f: [son,father] = l.split() if son[0].isdigit() and father[0].isdigit(): t[int_of_ip(son)] = int_of_ip(father) t2[int_of_ip(son)] = '' t2[int_of_ip(father)] = '' return(t,t2) def connected_components(g): visited = {} for v in g.iterkeys(): visited[v] = 0 n_cc = 0 q = Queue.Queue() for v in g.iterkeys(): if visited[v] == 0: n_cc += 1 q.put(v) visited[v] = n_cc while not q.empty(): v = q.get() for u in g[v]: if visited[u] == 0: visited[u] = n_cc q.put(u) cc_size_nodes = [0 for i in xrange(n_cc)] for v in g.iterkeys(): cc_size_nodes[visited[v]-1] += 1 cc_size_links = [0 for i in xrange(n_cc)] for v in g.iterkeys(): for u in g[v]: cc_size_links[visited[v]-1] += 1 for i in xrange(n_cc): cc_size_links[i] /= 2 if n_cc>0: size_largest_cc = max(cc_size_nodes) else: size_largest_cc = None return(n_cc,size_largest_cc,cc_size_nodes,cc_size_links) def distance(g,x,y): res = {} if x not in g or y not in g: return(-2) for v in g.iterkeys(): res[v] = -1 q = Queue.Queue() q.put(x) res[x] = 0 while not q.empty(): v = q.get() for u in g[v]: if res[u] == -1: res[u] = res[v] + 1 if u == y: return(res[u]) q.put(u) return(res[y]) def all_distance(g,x): res = {} if x not in g: return(-2) for v in g.iterkeys(): res[v] = -1 q = Queue.Queue() q.put(x) res[x] = 0 while not q.empty(): v = q.get() for u in g[v]: if res[u] == -1: res[u] = res[v] + 1 return(res) def output_cc(f,i,(n_cc,size_largest_cc,cc_size_nodes,cc_size_links)): for x in xrange(n_cc): n = cc_size_nodes[x] m = cc_size_links[x] avg_deg = 0 if n>0: avg_deg = (2.*m)/n density = 0 if n>1: density = (2.*m)/(n*(n-1.)) f.write("%d %d %d %d %d %f %f\n"%(i,n_cc,size_largest_cc,n,m,avg_deg,density)) f.flush() def make_subgraph(nodes,links): g = {} for x in nodes: g[x] = set() for x,y in links: if x in nodes and y in nodes: g[x].add(y) g[y].add(x) return(g) def make_subgraph_d1(nodes,links): g = {} for x in nodes: g[x] = set() for x,y in links: if x in nodes or y in nodes: g[x] = set() g[y] = set() for x,y in links: if x in nodes or y in nodes: g[x].add(y) g[y].add(x) return(g) if __name__ == '__main__': if len(sys.argv) != 5: sys.stderr.write("Usage: %s input_dir begin end out_dir\n"%sys.argv[0]) sys.exit(1) input_dir = sys.argv[1]+"/" begin = int(sys.argv[2]) end = int(sys.argv[3]) out_dir = sys.argv[4]+'/' intervals = set() l = [1,2,3,5,10,50,100] for pre in l: for cur in l: intervals.add((pre,cur)) intervals = set() intervals.add((5,1)) intervals.add((10,1)) intervals.add((50,1)) intervals.add((5,2)) intervals.add((10,2)) intervals.add((50,2)) intervals.add((5,5)) intervals.add((10,5)) intervals.add((50,5)) pre_set = set([x for (x,y) in intervals]) cur_set = set([y for (x,y) in intervals]) max_pre = max(pre_set) max_cur = max(cur_set) sys.stderr.write("max_pre = %d max_cur = %d\n"%(max_pre,max_cur)) sys.stderr.flush() # we will keep max_pre+max_cur steps in memory nodes = [set() for i in xrange(max_pre+max_cur)] links = [set() for i in xrange(max_pre+max_cur)] directed_links = [set() for i in xrange(max_pre+max_cur)] all_nodes = set() all_links = set() # we will create a bunch files for each (x,y) in intervals out_file = {} out_dist_file = {} out_cc_file_Up_Uc = {} out_cc_file_Uc_Up = {} out_cc_file_Ic_Up = {} out_cc_file_Ip_Uc = {} out_cc_file_Up_Uc_d1 = {} out_cc_file_Uc_Up_d1 = {} out_cc_file_Ic_Up_d1 = {} out_cc_file_Ip_Uc_d1 = {} for (pre,cur) in intervals: out = gzip.open(out_dir+"out_"+str(pre)+"_"+str(cur)+".gz","w") out_dist = gzip.open(out_dir+"out_dist_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Up_Uc = gzip.open(out_dir+"cc_Up_Uc_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Uc_Up = gzip.open(out_dir+"cc_Uc_Up_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Ic_Up = gzip.open(out_dir+"cc_Ic_Up_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Ip_Uc = gzip.open(out_dir+"cc_Ip_Uc_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Up_Uc_d1 = gzip.open(out_dir+"cc_Up_Uc_d1_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Uc_Up_d1 = gzip.open(out_dir+"cc_Uc_Up_d1_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Ic_Up_d1 = gzip.open(out_dir+"cc_Ic_Up_d1_"+str(pre)+"_"+str(cur)+".gz","w") out_cc_Ip_Uc_d1 = gzip.open(out_dir+"cc_Ip_Uc_d1_"+str(pre)+"_"+str(cur)+".gz","w") out.write("# 1: i = index of curent round\n") out.write("# 2: total nb nodes since beginning\n") out.write("# 3: total nb links since beginning\n") out.write("# 4: nb nodes in round i\n") out.write("# 5: nb links in round i\n") out.write("#\n") out.write("# 6: number of nodes in the union (Up) pre=%d previous rounds (from i-pre-1 to i-1)\n"%pre) out.write("# 7: ---------------------------- (Uc) cur=%d current ------ (from i to i+cur-1)\n"%cur) out.write("# 8: ---------------------- intersection (Ip) pre=%d previous rounds (from i-pre-1 to i-1)\n"%pre) out.write("# 9: ----------------------------------- (Ic) cur=%d current ------ (from i to i+cur-1)\n"%cur) out.write("#\n") out.write("#10: number of appearing nodes (ie in Uc and not in Up)\n") out.write("#11: number of disappearing nodes (ie in Up and not in Uc)\n") out.write("#12: number of appearing stable nodes (ie in Ic and not in Up)\n") out.write("#13: number of disappearing stable nodes (ie in Ip and not in Uc)\n") out.write("#\n") out.write("#14-21: same as 6-13 but with links\n") out.write("#\n") out.write("#22-: border stats\n") out.write("#\n") out.flush() out_dist.write("# each line: step number, number of appearing linkes, distances between extremities of one of them, number of nodes that have changed distance from its first end,number of nodes that have changed distance from its second end n\n") out_dist.flush() for f in [out_cc_Up_Uc,out_cc_Uc_Up,out_cc_Ic_Up,out_cc_Ip_Uc,out_cc_Up_Uc_d1,out_cc_Uc_Up_d1,out_cc_Ic_Up_d1,out_cc_Ip_Uc_d1]: f.write("# 1: i = index of curent round\n") f.write("# 2: number of connected components\n") f.write("# 3: size of the largest one\n") f.write("# 4-...: size of each connected component\n") f.flush() out_file[pre,cur] = out out_dist_file[pre,cur] = out_dist out_cc_file_Up_Uc[pre,cur] = out_cc_Up_Uc out_cc_file_Uc_Up[pre,cur] = out_cc_Uc_Up out_cc_file_Ic_Up[pre,cur] = out_cc_Ic_Up out_cc_file_Ip_Uc[pre,cur] = out_cc_Ip_Uc out_cc_file_Up_Uc_d1[pre,cur] = out_cc_Up_Uc_d1 out_cc_file_Uc_Up_d1[pre,cur] = out_cc_Uc_Up_d1 out_cc_file_Ic_Up_d1[pre,cur] = out_cc_Ic_Up_d1 out_cc_file_Ip_Uc_d1[pre,cur] = out_cc_Ip_Uc_d1 step = begin i = 0 while step < end: sys.stderr.write(str(step)+" ") sys.stderr.flush() (t,t2) = read_ip_tree(input_dir+"%d.out.gz"%step) nb_nodes = len(t2) nodes[i%(max_cur+max_pre)] = set(t.keys()) links[i%(max_cur+max_pre)] = set() directed_links[i%(max_cur+max_pre)] = set() for x in nodes[i%(max_cur+max_pre)]: if t[x] in nodes[i%(max_cur+max_pre)]: links[i%(max_cur+max_pre)].add((x,t[x])) links[i%(max_cur+max_pre)].add((t[x],x)) directed_links[i%(max_cur+max_pre)].add((t[x],x)) all_nodes.update(nodes[(i-max_cur+1)%(max_cur+max_pre)]) all_links.update(links[(i-max_cur+1)%(max_cur+max_pre)]) # compute the unions and intersections for each _current_ slice size k tmp_union_nodes = set() tmp_inter_nodes = nodes[(i-max_cur+1)%(max_cur+max_pre)].copy() tmp_union_links = set() tmp_inter_links = links[(i-max_cur+1)%(max_cur+max_pre)].copy() tmp_union_directed_links = set() tmp_inter_directed_links = directed_links[(i-max_cur+1)%(max_cur+max_pre)].copy() union_cur_nodes = {} inter_cur_nodes = {} union_cur_links = {} inter_cur_links = {} union_cur_directed_links = {} inter_cur_directed_links = {} for k in xrange(max_cur): tmp_union_nodes.update(nodes[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_inter_nodes.intersection_update(nodes[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_union_links.update(links[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_inter_links.intersection_update(links[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_union_directed_links.update(directed_links[(i-max_cur+k+1)%(max_cur+max_pre)]) tmp_inter_directed_links.intersection_update(directed_links[(i-max_cur+k+1)%(max_cur+max_pre)]) if k+1 in cur_set: union_cur_nodes[k+1] = tmp_union_nodes.copy() inter_cur_nodes[k+1] = tmp_inter_nodes.copy() union_cur_links[k+1] = tmp_union_links.copy() inter_cur_links[k+1] = tmp_inter_links.copy() union_cur_directed_links[k+1] = tmp_union_directed_links.copy() inter_cur_directed_links[k+1] = tmp_inter_directed_links.copy() # compute the unions and intersections for each _previous_ slice size k tmp_union_nodes = set() tmp_inter_nodes = nodes[(i-max_cur)%(max_cur+max_pre)].copy() tmp_union_links = set() tmp_inter_links = links[(i-max_cur)%(max_cur+max_pre)].copy() tmp_union_directed_links = set() tmp_inter_directed_links = directed_links[(i-max_cur)%(max_cur+max_pre)].copy() union_pre_nodes = {} inter_pre_nodes = {} union_pre_links = {} inter_pre_links = {} union_pre_directed_links = {} inter_pre_directed_links = {} for k in xrange(max_pre): tmp_union_nodes.update(nodes[(i-max_cur-k)%(max_cur+max_pre)]) tmp_inter_nodes.intersection_update(nodes[(i-max_cur-k)%(max_cur+max_pre)]) tmp_union_links.update(links[(i-max_cur-k)%(max_cur+max_pre)]) tmp_inter_links.intersection_update(links[(i-max_cur-k)%(max_cur+max_pre)]) tmp_union_directed_links.update(directed_links[(i-max_cur-k)%(max_cur+max_pre)]) tmp_inter_directed_links.intersection_update(directed_links[(i-max_cur-k)%(max_cur+max_pre)]) if k+1 in pre_set: union_pre_nodes[k+1] = tmp_union_nodes.copy() inter_pre_nodes[k+1] = tmp_inter_nodes.copy() union_pre_links[k+1] = tmp_union_links.copy() inter_pre_links[k+1] = tmp_inter_links.copy() union_pre_directed_links[k+1] = tmp_union_directed_links.copy() inter_pre_directed_links[k+1] = tmp_inter_directed_links.copy() # compute the statistics for each given interval for (pre,cur) in intervals: out = out_file[pre,cur] out_dist = out_dist_file[pre,cur] # basics out.write("%d %d %d %d %d %d"%(step-max_cur+1,len(all_nodes),nb_nodes,len(all_links)/2,len(nodes[(i-max_cur+1)%(max_cur+max_pre)]),len(links[(i-max_cur+1)%(max_cur+max_pre)])/2)) # node dynamics out.write(" %d %d %d %d"%(len(union_pre_nodes[pre]),len(union_cur_nodes[cur]),len(inter_pre_nodes[pre]),len(inter_cur_nodes[cur]))) appearing_nodes = union_cur_nodes[cur].difference(union_pre_nodes[pre]) disappearing_nodes = union_pre_nodes[pre].difference(union_cur_nodes[cur]) appearing_stable_nodes = inter_cur_nodes[cur].difference(union_pre_nodes[pre]) disappearing_stable_nodes = inter_pre_nodes[pre].difference(union_cur_nodes[cur]) out.write(" %d %d %d %d"%(len(appearing_nodes),len(disappearing_nodes),len(appearing_stable_nodes),len(disappearing_stable_nodes))) # link dynamics out.write(" %d %d %d %d"%(len(union_pre_links[pre]),len(union_cur_links[cur]),len(inter_pre_links[pre]),len(inter_cur_links[cur]))) appearing_links = union_cur_links[cur].difference(union_pre_links[pre]) disappearing_links = union_pre_links[pre].difference(union_cur_links[cur]) appearing_stable_links = inter_cur_links[cur].difference(union_pre_links[pre]) disappearing_stable_links = inter_pre_links[pre].difference(union_cur_links[cur]) out.write(" %d %d %d %d"%(len(appearing_links),len(disappearing_links),len(appearing_stable_links),len(disappearing_stable_links))) directed_appearing_links = union_cur_directed_links[cur].difference(union_pre_directed_links[pre]) directed_disappearing_links = union_pre_directed_links[pre].difference(union_cur_directed_links[cur]) # border nodes (/links ?) for (tmp_links,tmp_nodes) in [(appearing_links,appearing_nodes), (disappearing_links,disappearing_nodes), (directed_appearing_links,appearing_nodes), (directed_disappearing_links,disappearing_nodes)]: out_border_nodes = set() in_border_nodes = set() for (x,y) in tmp_links: if x not in tmp_nodes and y in tmp_nodes: out_border_nodes.add(x) in_border_nodes.add(y) out.write(" %d %d"%(len(out_border_nodes),len(in_border_nodes))) out.write("\n") out.flush() # distance between extremities of appearing links g = make_subgraph(union_pre_nodes[pre],union_pre_links[pre]) g1 = make_subgraph(union_cur_nodes[cur],union_cur_links[cur]) #print("\n * "+str(len(directed_appearing_links))) for (u,v) in directed_appearing_links: res1 = all_distance(g,u) if type(res1) == int : continue res11 = all_distance(g,v) if type(res11) == int : continue g[u].add(v) g[v].add(u) res2 = all_distance(g1,u) if type(res2) == int : continue res22 = all_distance(g1,v) if type(res22) == int : continue g[u].remove(v) g[v].remove(u) nb_nodes_changed_u = 0 nb_nodes_changed_v = 0 #for n in res1.iterkeys(): # if res2.has_key(n) and res1[n] != res2[n]: # nb_nodes_changed_u += 1 #for n in res11.iterkeys(): # if res22.has_key(n) and res11[n] != res22[n]: # nb_nodes_changed_v += 1 #out_dist.write("%d %d %d %d %d\n"%(step,len(directed_appearing_links),distance(g,u,v), nb_nodes_changed_u, nb_nodes_changed_v)) #sys.exit() out_dist.flush() # connected components for Up_Uc #for file,sub in [(out_cc_file_Up_Uc,make_subgraph), (out_cc_file_Up_Uc_d1,make_subgraph_d1)]: # output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(disappearing_nodes,disappearing_links))) # connected components for Uc_Up #for file,sub in [(out_cc_file_Uc_Up,make_subgraph), (out_cc_file_Uc_Up_d1,make_subgraph_d1)]: # output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(appearing_nodes,appearing_links))) # connected components for Ic_Up #for file,sub in [(out_cc_file_Ic_Up,make_subgraph), (out_cc_file_Ic_Up_d1,make_subgraph_d1)]: # output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(appearing_stable_nodes,appearing_links))) # connected components for Ip_Uc for file,sub in [(out_cc_file_Ip_Uc,make_subgraph), (out_cc_file_Ip_Uc_d1,make_subgraph_d1)]: output_cc(file[pre,cur], step-max_cur+1, connected_components(sub(disappearing_stable_nodes,disappearing_links))) i += 1 sys.stdout.write("%d "%i) sys.stdout.flush() step += 1 sys.stderr.write("\n") sys.stderr.flush()