import json
from datetime import datetime
from typing import Dict, List

import networkx as nx


class CitationNetworkAnalyzer:
    """Analyze citation networks and author collaborations - Web App Version"""

    def __init__(self):
        self.reset()
        print("✅ Citation network analyzer initialized (web app version)!")

    def reset(self):
        """Reset all data structures"""
        self.citation_graph = nx.DiGraph()
        self.author_graph = nx.Graph()
        self.paper_data = {}
        self.author_data = {}
        print("🔄 Citation network analyzer reset")

    def _safe_get_authors(self, paper: Dict) -> List[str]:
        """Safely extract and normalize the author list from a paper record"""
        authors = paper.get('authors', [])
        # Handle None
        if authors is None:
            return []
        # Handle a comma-separated string
        if isinstance(authors, str):
            if not authors.strip():
                return []
            return [a.strip() for a in authors.split(',') if a.strip()]
        # Handle a list of strings and/or author objects
        if isinstance(authors, list):
            result = []
            for author in authors:
                if isinstance(author, str) and author.strip():
                    result.append(author.strip())
                elif isinstance(author, dict):
                    # Handle author objects with a 'name' field (fall back to 'authorId')
                    name = author.get('name', '') or author.get('authorId', '')
                    if name and isinstance(name, str):
                        result.append(name.strip())
            return result
        # Unknown format
        return []
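
    # Example normalizations (illustrative inputs, not tied to any specific API):
    #   None                                      -> []
    #   "Ada Lovelace, Alan Turing"               -> ["Ada Lovelace", "Alan Turing"]
    #   [{"name": "Grace Hopper"}, "E. Dijkstra"] -> ["Grace Hopper", "E. Dijkstra"]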

    def _safe_add_author(self, author_name: str, paper_id: str, citation_count: int = 0):
        """Safely add author to the graph"""
        try:
            # Initialize author data if not exists
            if author_name not in self.author_data:
                self.author_data[author_name] = {
                    'papers': [],
                    'total_citations': 0
                }
            # Add to NetworkX graph if not exists
            if not self.author_graph.has_node(author_name):
                self.author_graph.add_node(author_name)
            # Update author data; count citations only the first time this
            # paper is attributed to the author, so re-adding the same paper
            # does not double count
            if paper_id not in self.author_data[author_name]['papers']:
                self.author_data[author_name]['papers'].append(paper_id)
                self.author_data[author_name]['total_citations'] += citation_count
            return True
        except Exception as e:
            print(f"⚠️ Error adding author {author_name}: {e}")
            return False

    def _safe_add_collaboration(self, author1: str, author2: str, paper_id: str):
        """Safely add collaboration edge between authors"""
        try:
            # Ensure both authors exist
            if not self.author_graph.has_node(author1):
                self.author_graph.add_node(author1)
            if not self.author_graph.has_node(author2):
                self.author_graph.add_node(author2)
            # Add or update edge
            if self.author_graph.has_edge(author1, author2):
                # Strengthen the existing edge, but only once per shared paper,
                # so weight stays equal to the number of shared papers
                edge_data = self.author_graph.edges[author1, author2]
                edge_data.setdefault('papers', [])
                if paper_id not in edge_data['papers']:
                    edge_data['papers'].append(paper_id)
                    edge_data['weight'] = edge_data.get('weight', 0) + 1
            else:
                # Add new edge
                self.author_graph.add_edge(author1, author2, weight=1, papers=[paper_id])
            return True
        except Exception as e:
            print(f"⚠️ Error adding collaboration {author1}-{author2}: {e}")
            return False

    def add_papers(self, papers: List[Dict]):
        """Add papers to the citation network"""
        if not papers:
            print("⚠️ No papers provided to add_papers")
            return
        processed_count = 0
        error_count = 0
        print(f"📝 Processing {len(papers)} papers...")
        for paper_idx, paper in enumerate(papers):
            try:
                # Validate paper input
                if not isinstance(paper, dict):
                    print(f"⚠️ Paper {paper_idx} is not a dict: {type(paper)}")
                    error_count += 1
                    continue
                # Generate paper ID: prefer an explicit ID, then the URL,
                # then a hash of the title as a last resort
                paper_id = paper.get('paper_id') or paper.get('url', '')
                if not paper_id:
                    title = paper.get('title', f'Unknown_{paper_idx}')
                    # hash() is not stable across interpreter runs unless
                    # PYTHONHASHSEED is fixed, so fallback IDs are session-local
                    paper_id = f"paper_{abs(hash(title)) % 1000000}"
                # Extract authors once and validate the citation count
                authors = self._safe_get_authors(paper)
                citation_count = paper.get('citation_count', 0)
                if not isinstance(citation_count, (int, float)):
                    citation_count = 0
                # Store paper data
                self.paper_data[paper_id] = {
                    'title': paper.get('title', ''),
                    'authors': authors,
                    'year': paper.get('year'),
                    'venue': paper.get('venue', ''),
                    'citation_count': citation_count,
                    'source': paper.get('source', ''),
                    'url': paper.get('url', ''),
                    'abstract': paper.get('abstract', '')
                }
                # Add to citation graph
                self.citation_graph.add_node(paper_id, **self.paper_data[paper_id])
                # Add authors
                valid_authors = []
                for author in authors:
                    if self._safe_add_author(author, paper_id, citation_count):
                        valid_authors.append(author)
                # Add collaborations between every pair of co-authors
                for i, author1 in enumerate(valid_authors):
                    for author2 in valid_authors[i + 1:]:  # avoids duplicates and self-loops
                        self._safe_add_collaboration(author1, author2, paper_id)
                processed_count += 1
            except Exception as e:
                print(f"⚠️ Error processing paper {paper_idx}: {e}")
                error_count += 1
                continue
        print(f"✅ Successfully processed {processed_count} papers ({error_count} errors)")

    def analyze_author_network(self) -> Dict:
        """Analyze author collaboration network"""
        try:
            if len(self.author_graph.nodes) == 0:
                return {'error': 'No authors in network'}
            # Basic network metrics; the graph is non-empty here, so at least
            # one connected component always exists
            metrics = {
                'total_authors': len(self.author_graph.nodes),
                'total_collaborations': len(self.author_graph.edges),
                'network_density': nx.density(self.author_graph),
                'number_of_components': nx.number_connected_components(self.author_graph),
                'largest_component_size': len(max(nx.connected_components(self.author_graph), key=len))
            }
            # Most collaborative authors (degree in the collaboration graph)
            collaboration_counts = dict(self.author_graph.degree())
            top_collaborators = sorted(collaboration_counts.items(), key=lambda x: x[1], reverse=True)[:10]
            # Most productive authors (number of papers)
            productivity = {author: len(data.get('papers', [])) for author, data in self.author_data.items()}
            top_productive = sorted(productivity.items(), key=lambda x: x[1], reverse=True)[:10]
            # Most cited authors (accumulated citation counts)
            citation_counts = {author: data.get('total_citations', 0) for author, data in self.author_data.items()}
            top_cited = sorted(citation_counts.items(), key=lambda x: x[1], reverse=True)[:10]
            return {
                'network_metrics': metrics,
                'top_collaborators': top_collaborators,
                'top_productive_authors': top_productive,
                'top_cited_authors': top_cited,
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def analyze_paper_network(self) -> Dict:
        """Analyze paper citation network"""
        try:
            if len(self.citation_graph.nodes) == 0:
                return {'error': 'No papers in network'}
            # Basic network metrics; the graph is non-empty here, so at least
            # one weakly connected component always exists
            metrics = {
                'total_papers': len(self.citation_graph.nodes),
                'total_citations': len(self.citation_graph.edges),
                'network_density': nx.density(self.citation_graph),
                'number_of_components': nx.number_weakly_connected_components(self.citation_graph),
                'largest_component_size': len(max(nx.weakly_connected_components(self.citation_graph), key=len))
            }
            # Most cited papers (in-degree) and most citing papers (out-degree)
            in_degree = dict(self.citation_graph.in_degree())
            most_cited = sorted(in_degree.items(), key=lambda x: x[1], reverse=True)[:10]
            out_degree = dict(self.citation_graph.out_degree())
            most_citing = sorted(out_degree.items(), key=lambda x: x[1], reverse=True)[:10]

            # Convert paper IDs to titles for readability, falling back to the raw ID
            def to_titles(ranked):
                return [
                    (self.paper_data[pid]['title'] if pid in self.paper_data else pid, count)
                    for pid, count in ranked
                ]

            return {
                'network_metrics': metrics,
                'most_cited_papers': to_titles(most_cited),
                'most_citing_papers': to_titles(most_citing),
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def get_network_summary(self) -> Dict:
        """Get comprehensive network summary"""
        try:
            author_analysis = self.analyze_author_network()
            paper_analysis = self.analyze_paper_network()
            return {
                'author_network': author_analysis,
                'paper_network': paper_analysis,
                'overall_stats': {
                    'total_papers': len(self.paper_data),
                    'total_authors': len(self.author_data),
                    'papers_per_author': len(self.paper_data) / max(len(self.author_data), 1),
                    'collaborations_per_author': len(self.author_graph.edges) / max(len(self.author_graph.nodes), 1)
                },
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }
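

# Minimal smoke test with synthetic data. These paper dicts are illustrative
# assumptions that simply mirror the field names read by add_papers ('title',
# 'authors', 'year', 'citation_count', ...); they do not come from any real
# dataset or API.
if __name__ == "__main__":
    analyzer = CitationNetworkAnalyzer()
    sample_papers = [
        {
            'paper_id': 'p1',
            'title': 'Graphs for Fun and Profit',
            'authors': ['Alice Example', 'Bob Example'],  # list form
            'year': 2021,
            'citation_count': 12,
        },
        {
            'paper_id': 'p2',
            'title': 'More Graphs',
            'authors': 'Alice Example, Carol Example',  # comma-separated string form
            'year': 2022,
            'citation_count': 3,
        },
    ]
    analyzer.add_papers(sample_papers)
    print(json.dumps(analyzer.get_network_summary(), indent=2, default=str))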