import hashlib
import json
from collections import defaultdict
from datetime import datetime
from itertools import combinations
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import networkx as nx


class CitationNetworkAnalyzer:
    """Analyze citation networks and author collaborations - Web App Version.

    State held on the instance (all rebuilt by ``reset``):

    * ``citation_graph`` -- ``nx.DiGraph`` with one node per paper; the node
      attributes mirror the matching ``paper_data`` entry.
    * ``author_graph`` -- ``nx.Graph`` with one node per author name and one
      edge per co-authoring pair.  Each edge carries ``weight`` (number of
      distinct shared papers) and ``papers`` (their IDs).
    * ``paper_data`` -- ``{paper_id: {...normalized paper fields...}}``.
    * ``author_data`` -- ``{author_name: {'papers': [...],
      'total_citations': number}}``.
    """

    def __init__(self):
        """Create an empty analyzer and announce readiness on stdout."""
        self.reset()
        print("✅ Citation network analyzer initialized (web app version)!")

    def reset(self):
        """Reset all data structures, discarding every paper and author."""
        self.citation_graph = nx.DiGraph()
        self.author_graph = nx.Graph()
        self.paper_data = {}
        self.author_data = {}
        print("🔄 Citation network analyzer reset")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _safe_get_authors(self, paper: Dict) -> List[str]:
        """Safely extract and normalize the author list from a paper.

        Accepted shapes of ``paper['authors']``:

        * ``None`` or missing -> ``[]``
        * a comma-separated string -> stripped, non-empty pieces
        * a list whose items are strings or dicts; for dicts the ``'name'``
          value is used, falling back to ``'authorId'`` when ``'name'`` is
          falsy.  Items of any other type are ignored.

        Any other type yields ``[]``.  Names are stripped, and entries that
        are empty after stripping are dropped.

        Args:
            paper: Raw paper record.

        Returns:
            Author names in their original order (duplicates are kept).
        """
        authors = paper.get('authors', [])

        if authors is None:
            return []

        if isinstance(authors, str):
            return [part.strip() for part in authors.split(',') if part.strip()]

        if isinstance(authors, list):
            names = []
            for entry in authors:
                if isinstance(entry, dict):
                    candidate = entry.get('name', '') or entry.get('authorId', '')
                else:
                    candidate = entry
                # Strip before testing so a whitespace-only name is rejected
                # instead of being recorded as the empty string.
                if isinstance(candidate, str):
                    candidate = candidate.strip()
                    if candidate:
                        names.append(candidate)
            return names

        # Unknown format
        return []

    @staticmethod
    def _fallback_paper_id(title: Any) -> str:
        """Derive a deterministic paper ID from a title.

        The built-in ``hash()`` is salted per process for strings, so it
        cannot be used for identifiers that should be the same on every run.
        A truncated SHA-256 digest is stable and far less collision-prone
        than a six-digit modulus.
        """
        digest = hashlib.sha256(str(title).encode('utf-8')).hexdigest()
        return f"paper_{digest[:12]}"

    @staticmethod
    def _top_n(scores: Dict[Any, Any], limit: int = 10) -> List[tuple]:
        """Return the ``limit`` highest ``(key, score)`` pairs, best first.

        Ties keep the insertion order of ``scores`` because ``sorted`` is
        stable.
        """
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:limit]

    def _safe_add_author(self, author_name: str, paper_id: str, citation_count: int = 0):
        """Safely register a paper for an author.

        Creates the ``author_data`` record and the ``author_graph`` node on
        first sight of the author.  The paper's citations are added to the
        author's total only the first time that paper is recorded for them,
        so re-adding a paper (or a duplicated name in one author list) does
        not inflate ``total_citations``.

        Args:
            author_name: Normalized author name.
            paper_id: Identifier of the paper being attributed.
            citation_count: Citations of that paper.

        Returns:
            ``True`` on success, ``False`` if an exception was raised (the
            error is printed, not propagated).
        """
        try:
            record = self.author_data.setdefault(
                author_name, {'papers': [], 'total_citations': 0}
            )

            # Adding a node that already exists leaves it unchanged.
            self.author_graph.add_node(author_name)

            if paper_id not in record['papers']:
                record['papers'].append(paper_id)
                record['total_citations'] += citation_count

            return True
        except Exception as e:
            print(f"⚠️ Error adding author {author_name}: {e}")
            return False

    def _safe_add_collaboration(self, author1: str, author2: str, paper_id: str):
        """Safely add or update the collaboration edge between two authors.

        A new edge starts with ``weight=1`` and ``papers=[paper_id]``.  For
        an existing edge the weight is incremented only when ``paper_id`` is
        not yet listed on it, so ``weight`` counts distinct shared papers.

        Args:
            author1: First author name.
            author2: Second author name.
            paper_id: Paper the two authors share.

        Returns:
            ``True`` if the edge was added or updated; ``False`` if both
            names are identical (no self-loop is created) or an exception
            was raised (the error is printed, not propagated).
        """
        try:
            if author1 == author2:
                # An author does not collaborate with themselves.
                return False

            graph = self.author_graph
            if graph.has_edge(author1, author2):
                edge_data = graph.edges[author1, author2]
                shared = edge_data.setdefault('papers', [])
                if paper_id not in shared:
                    shared.append(paper_id)
                    edge_data['weight'] = edge_data.get('weight', 0) + 1
            else:
                # add_edge also creates any endpoint that is still missing.
                graph.add_edge(author1, author2, weight=1, papers=[paper_id])

            return True
        except Exception as e:
            print(f"⚠️ Error adding collaboration {author1}-{author2}: {e}")
            return False

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def add_papers(self, papers: List[Dict]):
        """Add papers to the citation network.

        For every dict in ``papers`` this stores a normalized record in
        ``paper_data``, adds a node to ``citation_graph``, registers each
        author, and links every pair of co-authors in ``author_graph``.

        The paper ID is ``paper['paper_id']``, else ``paper['url']``, else a
        deterministic ID derived from the title.  A paper with no usable
        title falls back to ``Unknown_<index>`` so untitled papers in one
        batch do not all collapse onto a single ID.

        Non-dict entries and papers that raise during processing are counted
        as errors and skipped; progress and a summary are printed.

        Args:
            papers: Raw paper records.  A falsy value prints a warning and
                returns immediately.
        """
        if not papers:
            print("⚠️ No papers provided to add_papers")
            return

        processed_count = 0
        error_count = 0

        print(f"📝 Processing {len(papers)} papers...")

        for paper_idx, paper in enumerate(papers):
            try:
                if not isinstance(paper, dict):
                    print(f"⚠️ Paper {paper_idx} is not a dict: {type(paper)}")
                    error_count += 1
                    continue

                paper_id = paper.get('paper_id') or paper.get('url', '')
                if not paper_id:
                    title = paper.get('title') or f'Unknown_{paper_idx}'
                    paper_id = self._fallback_paper_id(title)

                authors = self._safe_get_authors(paper)

                self.paper_data[paper_id] = {
                    'title': paper.get('title', ''),
                    'authors': authors,
                    'year': paper.get('year'),
                    'venue': paper.get('venue', ''),
                    'citation_count': paper.get('citation_count', 0),
                    'source': paper.get('source', ''),
                    'url': paper.get('url', ''),
                    'abstract': paper.get('abstract', '')
                }

                self.citation_graph.add_node(paper_id, **self.paper_data[paper_id])

                # Only numeric citation counts contribute to author totals.
                citation_count = paper.get('citation_count', 0)
                if not isinstance(citation_count, (int, float)):
                    citation_count = 0

                # De-duplicate while keeping order: a name listed twice on
                # one paper must not be counted twice or paired with itself.
                unique_authors = list(dict.fromkeys(authors))

                valid_authors = [
                    author for author in unique_authors
                    if self._safe_add_author(author, paper_id, citation_count)
                ]

                # Each unordered pair exactly once.
                for author1, author2 in combinations(valid_authors, 2):
                    self._safe_add_collaboration(author1, author2, paper_id)

                processed_count += 1

            except Exception as e:
                print(f"⚠️ Error processing paper {paper_idx}: {e}")
                error_count += 1
                continue

        print(f"✅ Successfully processed {processed_count} papers ({error_count} errors)")

    def analyze_author_network(self) -> Dict:
        """Analyze the author collaboration network.

        Returns:
            ``{'error': 'No authors in network'}`` when the graph is empty.
            Otherwise a dict with:

            * ``network_metrics`` -- author/edge counts, density, number of
              connected components and size of the largest one;
            * ``top_collaborators`` -- up to 10 ``(author, degree)`` pairs;
            * ``top_productive_authors`` -- up to 10 ``(author, n_papers)``;
            * ``top_cited_authors`` -- up to 10 ``(author, total_citations)``;
            * ``analysis_timestamp`` -- ISO-8601 local time.

            If anything raises, ``{'error': <message>,
            'analysis_timestamp': ...}`` is returned instead.
        """
        try:
            graph = self.author_graph
            if graph.number_of_nodes() == 0:
                return {'error': 'No authors in network'}

            # Materialize once; both component metrics derive from it.
            components = list(nx.connected_components(graph))

            metrics = {
                'total_authors': graph.number_of_nodes(),
                'total_collaborations': graph.number_of_edges(),
                'network_density': nx.density(graph),
                'number_of_components': len(components),
                'largest_component_size': max(map(len, components), default=0)
            }

            collaboration_counts = dict(graph.degree())
            productivity = {
                author: len(data.get('papers', []))
                for author, data in self.author_data.items()
            }
            citation_counts = {
                author: data.get('total_citations', 0)
                for author, data in self.author_data.items()
            }

            return {
                'network_metrics': metrics,
                'top_collaborators': self._top_n(collaboration_counts),
                'top_productive_authors': self._top_n(productivity),
                'top_cited_authors': self._top_n(citation_counts),
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def analyze_paper_network(self) -> Dict:
        """Analyze the paper citation network.

        Rankings are based on the in-/out-degree of ``citation_graph``.
        NOTE(review): none of the methods shown here add citation edges, so
        unless edges are inserted elsewhere all degrees will be zero.

        Returns:
            ``{'error': 'No papers in network'}`` when the graph is empty.
            Otherwise a dict with:

            * ``network_metrics`` -- paper/edge counts, density, number of
              weakly connected components and size of the largest one;
            * ``most_cited_papers`` -- up to 10 ``(title, in_degree)`` pairs;
            * ``most_citing_papers`` -- up to 10 ``(title, out_degree)``;
            * ``analysis_timestamp`` -- ISO-8601 local time.

            Papers missing from ``paper_data`` are reported by ID instead of
            title.  If anything raises, ``{'error': <message>,
            'analysis_timestamp': ...}`` is returned instead.
        """
        try:
            graph = self.citation_graph
            if graph.number_of_nodes() == 0:
                return {'error': 'No papers in network'}

            # Materialize once; both component metrics derive from it.
            components = list(nx.weakly_connected_components(graph))

            metrics = {
                'total_papers': graph.number_of_nodes(),
                'total_citations': graph.number_of_edges(),
                'network_density': nx.density(graph),
                'number_of_components': len(components),
                'largest_component_size': max(map(len, components), default=0)
            }

            most_cited = self._top_n(dict(graph.in_degree()))
            most_citing = self._top_n(dict(graph.out_degree()))

            def with_titles(ranked):
                """Swap paper IDs for titles where the paper is known."""
                labelled = []
                for paper_id, count in ranked:
                    if paper_id in self.paper_data:
                        labelled.append((self.paper_data[paper_id]['title'], count))
                    else:
                        labelled.append((paper_id, count))
                return labelled

            return {
                'network_metrics': metrics,
                'most_cited_papers': with_titles(most_cited),
                'most_citing_papers': with_titles(most_citing),
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def get_network_summary(self) -> Dict:
        """Get a comprehensive network summary.

        Combines ``analyze_author_network`` and ``analyze_paper_network``
        with overall ratios.  Both ratios divide by ``max(count, 1)`` so an
        empty analyzer yields ``0.0`` rather than raising.

        Returns:
            A dict with ``author_network``, ``paper_network``,
            ``overall_stats`` and ``analysis_timestamp``; or ``{'error':
            <message>, 'analysis_timestamp': ...}`` if anything raises.
        """
        try:
            author_analysis = self.analyze_author_network()
            paper_analysis = self.analyze_paper_network()

            paper_total = len(self.paper_data)
            author_total = len(self.author_data)
            edge_total = self.author_graph.number_of_edges()
            node_total = self.author_graph.number_of_nodes()

            return {
                'author_network': author_analysis,
                'paper_network': paper_analysis,
                'overall_stats': {
                    'total_papers': paper_total,
                    'total_authors': author_total,
                    'papers_per_author': paper_total / max(author_total, 1),
                    'collaborations_per_author': edge_total / max(node_total, 1)
                },
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }