# Citation network analyzer — Hugging Face Spaces app file (commit 519c06d, ~12,267 bytes).
# NOTE: the original page header ("Spaces: Sleeping", file size, line-number gutter)
# was viewer residue, not code, and has been replaced by this comment.
import networkx as nx
import json
from datetime import datetime
from typing import List, Dict, Any
import matplotlib.pyplot as plt
from collections import defaultdict
class CitationNetworkAnalyzer:
    """Analyze citation networks and author collaborations - Web App Version.

    Maintains two NetworkX graphs plus plain-dict mirrors:

    * ``citation_graph`` -- directed paper-citation graph (nodes keyed by paper id,
      node attributes carry the paper metadata).
    * ``author_graph``   -- undirected co-authorship graph (edge ``weight`` counts
      collaborations, edge ``papers`` lists the distinct paper ids behind them).
    * ``paper_data``     -- paper_id -> metadata dict.
    * ``author_data``    -- author name -> {'papers': [...], 'total_citations': int}.
    """

    def __init__(self):
        self.reset()
        print("✅ Citation network analyzer initialized (web app version)!")

    def reset(self):
        """Reset all data structures to an empty network."""
        self.citation_graph = nx.DiGraph()  # paper -> paper citation edges
        self.author_graph = nx.Graph()      # undirected co-authorship edges
        self.paper_data = {}                # paper_id -> metadata dict
        self.author_data = {}               # name -> {'papers': [...], 'total_citations': int}
        print("🔄 Citation network analyzer reset")

    @staticmethod
    def _top10(scores: Dict[str, Any]) -> List:
        """Return the ten highest-scoring ``(key, value)`` pairs of *scores*."""
        return sorted(scores.items(), key=lambda item: item[1], reverse=True)[:10]

    def _safe_get_authors(self, paper: Dict) -> List[str]:
        """Safely extract and normalize the author list from *paper*.

        Accepts ``None``, a comma-separated string, or a list whose items are
        strings or dicts (``'name'`` preferred, ``'authorId'`` as fallback).
        Returns stripped, non-empty names; any other format yields ``[]``.
        """
        authors = paper.get('authors', [])
        if authors is None:
            return []
        if isinstance(authors, str):
            # Comma-separated string; blank pieces (and an empty string) drop out.
            return [part.strip() for part in authors.split(',') if part.strip()]
        if isinstance(authors, list):
            names: List[str] = []
            for entry in authors:
                if isinstance(entry, str) and entry.strip():
                    names.append(entry.strip())
                elif isinstance(entry, dict):
                    # Author objects: prefer 'name', fall back to 'authorId'.
                    name = entry.get('name', '') or entry.get('authorId', '')
                    if name and isinstance(name, str):
                        names.append(name.strip())
            return names
        # Unknown format
        return []

    def _safe_add_author(self, author_name: str, paper_id: str, citation_count: int = 0):
        """Safely add *author_name* to the author graph and bookkeeping dicts.

        Citations are credited only the first time a given paper id is seen
        for the author, so re-adding the same paper cannot double-count.
        Returns True on success, False if anything went wrong.
        """
        try:
            record = self.author_data.setdefault(
                author_name, {'papers': [], 'total_citations': 0}
            )
            if not self.author_graph.has_node(author_name):
                self.author_graph.add_node(author_name)
            if paper_id not in record['papers']:
                record['papers'].append(paper_id)
                record['total_citations'] += citation_count
            return True
        except Exception as e:
            # Best-effort: one bad author must not abort paper processing.
            print(f"⚠️ Error adding author {author_name}: {e}")
            return False

    def _safe_add_collaboration(self, author1: str, author2: str, paper_id: str):
        """Safely add or strengthen the co-authorship edge author1 -- author2.

        Edge ``weight`` counts collaborations; ``papers`` records the distinct
        paper ids behind them. Returns True on success, False on error.
        """
        try:
            # Ensure both endpoints exist before touching the edge.
            for author in (author1, author2):
                if not self.author_graph.has_node(author):
                    self.author_graph.add_node(author)
            if self.author_graph.has_edge(author1, author2):
                edge_data = self.author_graph.edges[author1, author2]
                edge_data['weight'] = edge_data.get('weight', 0) + 1
                edge_data.setdefault('papers', [])
                if paper_id not in edge_data['papers']:
                    edge_data['papers'].append(paper_id)
            else:
                self.author_graph.add_edge(author1, author2, weight=1, papers=[paper_id])
            return True
        except Exception as e:
            print(f"⚠️ Error adding collaboration {author1}-{author2}: {e}")
            return False

    def add_papers(self, papers: List[Dict]):
        """Add *papers* (a list of metadata dicts) to the citation network.

        Recognized keys: ``paper_id``, ``url``, ``title``, ``authors``,
        ``year``, ``venue``, ``citation_count``, ``source``, ``abstract``.
        Malformed entries are skipped and counted as errors.
        """
        if not papers:
            print("⚠️ No papers provided to add_papers")
            return
        processed_count = 0
        error_count = 0
        print(f"📝 Processing {len(papers)} papers...")
        for paper_idx, paper in enumerate(papers):
            try:
                if not isinstance(paper, dict):
                    print(f"⚠️ Paper {paper_idx} is not a dict: {type(paper)}")
                    error_count += 1
                    continue
                # Id preference: explicit id, then URL, then a title hash.
                # NOTE(review): hash() is salted per process (PYTHONHASHSEED),
                # so title-derived ids are not stable across program runs.
                paper_id = paper.get('paper_id') or paper.get('url', '')
                if not paper_id:
                    title = paper.get('title', f'Unknown_{paper_idx}')
                    paper_id = f"paper_{abs(hash(title)) % 1000000}"
                # Normalize authors/citations ONCE, before storing, so the
                # stored metadata and the author stats agree (the original
                # stored a raw, unvalidated citation_count).
                authors = self._safe_get_authors(paper)
                citation_count = paper.get('citation_count', 0)
                if not isinstance(citation_count, (int, float)):
                    citation_count = 0
                self.paper_data[paper_id] = {
                    'title': paper.get('title', ''),
                    'authors': authors,
                    'year': paper.get('year'),
                    'venue': paper.get('venue', ''),
                    'citation_count': citation_count,
                    'source': paper.get('source', ''),
                    'url': paper.get('url', ''),
                    'abstract': paper.get('abstract', '')
                }
                self.citation_graph.add_node(paper_id, **self.paper_data[paper_id])
                # Deduplicate (order-preserving) so a repeated author name
                # cannot create a self-loop collaboration edge.
                valid_authors = [
                    author for author in dict.fromkeys(authors)
                    if self._safe_add_author(author, paper_id, citation_count)
                ]
                # One collaboration per unordered pair of co-authors.
                for i, author1 in enumerate(valid_authors):
                    for author2 in valid_authors[i + 1:]:
                        self._safe_add_collaboration(author1, author2, paper_id)
                processed_count += 1
            except Exception as e:
                print(f"⚠️ Error processing paper {paper_idx}: {e}")
                error_count += 1
                continue
        print(f"✅ Successfully processed {processed_count} papers ({error_count} errors)")

    def analyze_author_network(self) -> Dict:
        """Analyze the author collaboration network.

        Returns a dict with network metrics plus top-10 lists (collaborators
        by degree, productivity by paper count, total citations), or
        ``{'error': ...}`` on failure / empty network.
        """
        try:
            if len(self.author_graph.nodes) == 0:
                return {'error': 'No authors in network'}
            # Enumerate components once instead of re-deriving them per metric.
            components = list(nx.connected_components(self.author_graph))
            metrics = {
                'total_authors': len(self.author_graph.nodes),
                'total_collaborations': len(self.author_graph.edges),
                'network_density': nx.density(self.author_graph),
                'number_of_components': len(components),
                'largest_component_size': max((len(c) for c in components), default=0)
            }
            top_collaborators = self._top10(dict(self.author_graph.degree()))
            top_productive = self._top10(
                {author: len(data.get('papers', []))
                 for author, data in self.author_data.items()}
            )
            top_cited = self._top10(
                {author: data.get('total_citations', 0)
                 for author, data in self.author_data.items()}
            )
            return {
                'network_metrics': metrics,
                'top_collaborators': top_collaborators,
                'top_productive_authors': top_productive,
                'top_cited_authors': top_cited,
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def _titles_for(self, ranked: List) -> List:
        """Map ``(paper_id, count)`` pairs to ``(title, count)`` where known."""
        return [
            (self.paper_data[pid]['title'] if pid in self.paper_data else pid, count)
            for pid, count in ranked
        ]

    def analyze_paper_network(self) -> Dict:
        """Analyze the paper citation network.

        Returns network metrics plus the ten most cited (in-degree) and most
        citing (out-degree) papers, with ids replaced by titles where known,
        or ``{'error': ...}`` on failure / empty network.
        """
        try:
            if len(self.citation_graph.nodes) == 0:
                return {'error': 'No papers in network'}
            # Directed graph: components are weakly connected; compute once.
            components = list(nx.weakly_connected_components(self.citation_graph))
            metrics = {
                'total_papers': len(self.citation_graph.nodes),
                'total_citations': len(self.citation_graph.edges),
                'network_density': nx.density(self.citation_graph),
                'number_of_components': len(components),
                'largest_component_size': max((len(c) for c in components), default=0)
            }
            most_cited = self._top10(dict(self.citation_graph.in_degree()))
            most_citing = self._top10(dict(self.citation_graph.out_degree()))
            return {
                'network_metrics': metrics,
                'most_cited_papers': self._titles_for(most_cited),
                'most_citing_papers': self._titles_for(most_citing),
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def get_network_summary(self) -> Dict:
        """Get a comprehensive summary combining author and paper analyses."""
        try:
            author_analysis = self.analyze_author_network()
            paper_analysis = self.analyze_paper_network()
            return {
                'author_network': author_analysis,
                'paper_network': paper_analysis,
                'overall_stats': {
                    'total_papers': len(self.paper_data),
                    'total_authors': len(self.author_data),
                    # max(..., 1) guards the empty-network division.
                    'papers_per_author': len(self.paper_data) / max(len(self.author_data), 1),
                    'collaborations_per_author': len(self.author_graph.edges) / max(len(self.author_graph.nodes), 1)
                },
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }