import json
from datetime import datetime
from typing import Dict, List

import networkx as nx


class CitationNetworkAnalyzer:
    """Analyze citation networks and author collaborations - Web App Version"""

    def __init__(self):
        self.reset()
        print("✅ Citation network analyzer initialized (web app version)!")

    def reset(self):
        """Reset all data structures"""
        self.citation_graph = nx.DiGraph()
        self.author_graph = nx.Graph()
        self.paper_data = {}
        self.author_data = {}
        print("🔄 Citation network analyzer reset")

    def _safe_get_authors(self, paper: Dict) -> List[str]:
        """Safely extract and normalize the author list from a paper record"""
        authors = paper.get('authors', [])
        # Handle None
        if authors is None:
            return []
        # Handle a comma-separated string
        if isinstance(authors, str):
            if not authors.strip():
                return []
            return [a.strip() for a in authors.split(',') if a.strip()]
        # Handle a list of strings and/or author objects
        if isinstance(authors, list):
            result = []
            for author in authors:
                if isinstance(author, str) and author.strip():
                    result.append(author.strip())
                elif isinstance(author, dict):
                    # Handle author objects with a 'name' field (fall back to 'authorId')
                    name = author.get('name', '') or author.get('authorId', '')
                    if name and isinstance(name, str):
                        result.append(name.strip())
            return result
        # Unknown format
        return []
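
    # Example normalizations (illustrative inputs, not tied to any specific API):
    #   None                                      -> []
    #   "Ada Lovelace, Alan Turing"               -> ["Ada Lovelace", "Alan Turing"]
    #   [{"name": "Grace Hopper"}, "E. Dijkstra"] -> ["Grace Hopper", "E. Dijkstra"]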

    def _safe_add_author(self, author_name: str, paper_id: str, citation_count: int = 0):
        """Safely add author to the graph"""
        try:
            # Initialize author data if not exists
            if author_name not in self.author_data:
                self.author_data[author_name] = {
                    'papers': [],
                    'total_citations': 0
                }
            # Add to NetworkX graph if not exists
            if not self.author_graph.has_node(author_name):
                self.author_graph.add_node(author_name)
            # Update author data; count citations only the first time this
            # paper is attributed to the author, so re-adding the same paper
            # does not double count
            if paper_id not in self.author_data[author_name]['papers']:
                self.author_data[author_name]['papers'].append(paper_id)
                self.author_data[author_name]['total_citations'] += citation_count
            return True
        except Exception as e:
            print(f"⚠️ Error adding author {author_name}: {e}")
            return False

    def _safe_add_collaboration(self, author1: str, author2: str, paper_id: str):
        """Safely add collaboration edge between authors"""
        try:
            # Ensure both authors exist
            if not self.author_graph.has_node(author1):
                self.author_graph.add_node(author1)
            if not self.author_graph.has_node(author2):
                self.author_graph.add_node(author2)
            # Add or update edge
            if self.author_graph.has_edge(author1, author2):
                # Strengthen the existing edge, but only once per shared paper,
                # so weight stays equal to the number of shared papers
                edge_data = self.author_graph.edges[author1, author2]
                edge_data.setdefault('papers', [])
                if paper_id not in edge_data['papers']:
                    edge_data['papers'].append(paper_id)
                    edge_data['weight'] = edge_data.get('weight', 0) + 1
            else:
                # Add new edge
                self.author_graph.add_edge(author1, author2, weight=1, papers=[paper_id])
            return True
        except Exception as e:
            print(f"⚠️ Error adding collaboration {author1}-{author2}: {e}")
            return False

    def add_papers(self, papers: List[Dict]):
        """Add papers to the citation network"""
        if not papers:
            print("⚠️ No papers provided to add_papers")
            return
        processed_count = 0
        error_count = 0
        print(f"📝 Processing {len(papers)} papers...")
        for paper_idx, paper in enumerate(papers):
            try:
                # Validate paper input
                if not isinstance(paper, dict):
                    print(f"⚠️ Paper {paper_idx} is not a dict: {type(paper)}")
                    error_count += 1
                    continue
                # Generate paper ID: prefer an explicit ID, then the URL,
                # then a hash of the title as a last resort
                paper_id = paper.get('paper_id') or paper.get('url', '')
                if not paper_id:
                    title = paper.get('title', f'Unknown_{paper_idx}')
                    # hash() is not stable across interpreter runs unless
                    # PYTHONHASHSEED is fixed, so fallback IDs are session-local
                    paper_id = f"paper_{abs(hash(title)) % 1000000}"
                # Extract authors once and validate the citation count
                authors = self._safe_get_authors(paper)
                citation_count = paper.get('citation_count', 0)
                if not isinstance(citation_count, (int, float)):
                    citation_count = 0
                # Store paper data
                self.paper_data[paper_id] = {
                    'title': paper.get('title', ''),
                    'authors': authors,
                    'year': paper.get('year'),
                    'venue': paper.get('venue', ''),
                    'citation_count': citation_count,
                    'source': paper.get('source', ''),
                    'url': paper.get('url', ''),
                    'abstract': paper.get('abstract', '')
                }
                # Add to citation graph
                self.citation_graph.add_node(paper_id, **self.paper_data[paper_id])
                # Add authors
                valid_authors = []
                for author in authors:
                    if self._safe_add_author(author, paper_id, citation_count):
                        valid_authors.append(author)
                # Add collaborations between every pair of co-authors
                for i, author1 in enumerate(valid_authors):
                    for author2 in valid_authors[i + 1:]:  # avoids duplicates and self-loops
                        self._safe_add_collaboration(author1, author2, paper_id)
                processed_count += 1
            except Exception as e:
                print(f"⚠️ Error processing paper {paper_idx}: {e}")
                error_count += 1
                continue
        print(f"✅ Successfully processed {processed_count} papers ({error_count} errors)")

    def analyze_author_network(self) -> Dict:
        """Analyze author collaboration network"""
        try:
            if len(self.author_graph.nodes) == 0:
                return {'error': 'No authors in network'}
            # Basic network metrics; the graph is non-empty here, so at least
            # one connected component always exists
            metrics = {
                'total_authors': len(self.author_graph.nodes),
                'total_collaborations': len(self.author_graph.edges),
                'network_density': nx.density(self.author_graph),
                'number_of_components': nx.number_connected_components(self.author_graph),
                'largest_component_size': len(max(nx.connected_components(self.author_graph), key=len))
            }
            # Most collaborative authors (degree in the collaboration graph)
            collaboration_counts = dict(self.author_graph.degree())
            top_collaborators = sorted(collaboration_counts.items(), key=lambda x: x[1], reverse=True)[:10]
            # Most productive authors (number of papers)
            productivity = {author: len(data.get('papers', [])) for author, data in self.author_data.items()}
            top_productive = sorted(productivity.items(), key=lambda x: x[1], reverse=True)[:10]
            # Most cited authors (accumulated citation counts)
            citation_counts = {author: data.get('total_citations', 0) for author, data in self.author_data.items()}
            top_cited = sorted(citation_counts.items(), key=lambda x: x[1], reverse=True)[:10]
            return {
                'network_metrics': metrics,
                'top_collaborators': top_collaborators,
                'top_productive_authors': top_productive,
                'top_cited_authors': top_cited,
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def analyze_paper_network(self) -> Dict:
        """Analyze paper citation network"""
        try:
            if len(self.citation_graph.nodes) == 0:
                return {'error': 'No papers in network'}
            # Basic network metrics; the graph is non-empty here, so at least
            # one weakly connected component always exists
            metrics = {
                'total_papers': len(self.citation_graph.nodes),
                'total_citations': len(self.citation_graph.edges),
                'network_density': nx.density(self.citation_graph),
                'number_of_components': nx.number_weakly_connected_components(self.citation_graph),
                'largest_component_size': len(max(nx.weakly_connected_components(self.citation_graph), key=len))
            }
            # Most cited papers (in-degree) and most citing papers (out-degree)
            in_degree = dict(self.citation_graph.in_degree())
            most_cited = sorted(in_degree.items(), key=lambda x: x[1], reverse=True)[:10]
            out_degree = dict(self.citation_graph.out_degree())
            most_citing = sorted(out_degree.items(), key=lambda x: x[1], reverse=True)[:10]

            # Convert paper IDs to titles for readability, falling back to the raw ID
            def to_titles(ranked):
                return [
                    (self.paper_data[pid]['title'] if pid in self.paper_data else pid, count)
                    for pid, count in ranked
                ]

            return {
                'network_metrics': metrics,
                'most_cited_papers': to_titles(most_cited),
                'most_citing_papers': to_titles(most_citing),
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def get_network_summary(self) -> Dict:
        """Get comprehensive network summary"""
        try:
            author_analysis = self.analyze_author_network()
            paper_analysis = self.analyze_paper_network()
            return {
                'author_network': author_analysis,
                'paper_network': paper_analysis,
                'overall_stats': {
                    'total_papers': len(self.paper_data),
                    'total_authors': len(self.author_data),
                    'papers_per_author': len(self.paper_data) / max(len(self.author_data), 1),
                    'collaborations_per_author': len(self.author_graph.edges) / max(len(self.author_graph.nodes), 1)
                },
                'analysis_timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }
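

# Minimal smoke test with synthetic data. These paper dicts are illustrative
# assumptions that simply mirror the field names read by add_papers ('title',
# 'authors', 'year', 'citation_count', ...); they do not come from any real
# dataset or API.
if __name__ == "__main__":
    analyzer = CitationNetworkAnalyzer()
    sample_papers = [
        {
            'paper_id': 'p1',
            'title': 'Graphs for Fun and Profit',
            'authors': ['Alice Example', 'Bob Example'],  # list form
            'year': 2021,
            'citation_count': 12,
        },
        {
            'paper_id': 'p2',
            'title': 'More Graphs',
            'authors': 'Alice Example, Carol Example',  # comma-separated string form
            'year': 2022,
            'citation_count': 3,
        },
    ]
    analyzer.add_papers(sample_papers)
    print(json.dumps(analyzer.get_network_summary(), indent=2, default=str))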