Spaces:

tsrivallabh
/

Linkedin-Assistant

Sleeping

App Files Files Community

Linkedin-Assistant / profile_preprocessing.py

tsrivallabh

Upload 10 files

5318b09 verified 18 days ago

raw

history blame contribute delete

5.93 kB

	from typing import List, Dict, Any
	from urllib.parse import urlparse
	# ========== 3. PROFILE PREPROCESSING HELPERS ==========
	def normalize_url(url) -> str:
	    """Return ``url`` without surrounding whitespace or trailing slashes.

	    Args:
	        url: The URL string to normalize. ``None`` or an empty string is
	            accepted and normalized to ``""`` so callers do not have to
	            guard against a missing value themselves.

	    Returns:
	        The stripped URL with every trailing ``/`` removed, or ``""`` when
	        no URL was supplied.
	    """
	    if not url:
	        # Previously ``None`` raised AttributeError on ``.strip()``.
	        return ""
	    return url.strip().rstrip('/')

	def summarize_skills(skills: List[Dict]) -> str:
	    """Return the non-empty ``title`` values of ``skills`` joined by ", ".

	    Entries without a ``title`` key, or with a falsy title, are skipped.
	    """
	    titles = (skill.get('title') for skill in skills)
	    # filter(None, ...) drops missing / empty titles.
	    return ', '.join(filter(None, titles))

	def summarize_projects(projects: List[Dict]) -> str:
	    """Summarize projects as one ``"<title>: <description>"`` line each.

	    The description is the space-joined ``text`` of every entry whose
	    ``type`` is ``'textComponent'`` under
	    ``project['subComponents'][*]['description']``.

	    Args:
	        projects: Project dicts. ``None`` is treated as an empty list, and
	            ``None`` values for ``title``, ``subComponents``,
	            ``description`` or ``text`` are treated as empty rather than
	            raising ``TypeError`` or rendering the literal ``"None"``.

	    Returns:
	        The per-project lines joined with newlines; ``""`` when there are
	        no projects.
	    """
	    lines = []
	    for project in projects or []:
	        title = project.get('title') or ''
	        texts = []
	        for component in project.get('subComponents') or []:
	            for part in component.get('description') or []:
	                if part.get('type') == 'textComponent':
	                    texts.append(part.get('text') or '')
	        # join + strip yields the same text as the old ``desc += text + ' '``
	        # accumulation, without repeated string concatenation.
	        description = ' '.join(texts).strip()
	        lines.append(f"{title}: {description}")
	    return '\n'.join(lines)

	def summarize_educations(educations: List[Dict]) -> str:
	    """Summarize education entries as ``"Title (subtitle, caption)"`` items.

	    Entries without a truthy ``title`` are skipped. Missing or ``None``
	    ``subtitle`` / ``caption`` values are left out of the parentheses, and
	    the parentheses are omitted entirely when neither is present, so the
	    summary never contains ``"(, )"`` or ``"(None, None)"``.

	    Args:
	        educations: Education dicts; ``None`` is treated as an empty list.

	    Returns:
	        The formatted entries joined by ", "; ``""`` when there are none.
	    """
	    entries = []
	    for education in educations or []:
	        title = education.get('title')
	        if not title:
	            continue
	        details = ', '.join(
	            str(part)
	            for part in (education.get('subtitle'), education.get('caption'))
	            if part
	        )
	        entries.append(f"{title} ({details})" if details else f"{title}")
	    return ', '.join(entries)

	def summarize_certs(certs: List[Dict]) -> str:
	    """Summarize certifications as ``"Title (subtitle, caption)"`` items.

	    Entries without a truthy ``title`` are skipped. Missing or ``None``
	    ``subtitle`` / ``caption`` values are left out of the parentheses, and
	    the parentheses are omitted entirely when neither is present, so the
	    summary never contains ``"(, )"`` or ``"(None, None)"``.

	    Args:
	        certs: Certification dicts; ``None`` is treated as an empty list.

	    Returns:
	        The formatted entries joined by ", "; ``""`` when there are none.
	    """
	    entries = []
	    for cert in certs or []:
	        title = cert.get('title')
	        if not title:
	            continue
	        details = ', '.join(
	            str(part)
	            for part in (cert.get('subtitle'), cert.get('caption'))
	            if part
	        )
	        entries.append(f"{title} ({details})" if details else f"{title}")
	    return ', '.join(entries)

	def summarize_test_scores(scores: List[Dict]) -> str:
	    """Summarize test scores as ``"Title (subtitle)"`` items.

	    Entries without a truthy ``title`` are skipped. When ``subtitle`` is
	    missing or ``None`` only the title is emitted, so the summary never
	    contains ``"()"`` or ``"(None)"``.

	    Args:
	        scores: Test-score dicts; ``None`` is treated as an empty list.

	    Returns:
	        The formatted entries joined by ", "; ``""`` when there are none.
	    """
	    entries = []
	    for score in scores or []:
	        title = score.get('title')
	        if not title:
	            continue
	        subtitle = score.get('subtitle')
	        entries.append(f"{title} ({subtitle})" if subtitle else f"{title}")
	    return ', '.join(entries)

	def summarize_generic(items: List[Dict], key='title') -> str:
	    """Join the truthy ``key`` values found in ``items`` with ", ".

	    Items that lack ``key``, or hold a falsy value for it, contribute
	    nothing to the result.
	    """
	    values = []
	    for entry in items:
	        value = entry.get(key)
	        if value:
	            values.append(value)
	    return ', '.join(values)


	# === Preprocess raw profile into summarized profile ===
	def preprocess_profile(raw_profile: Dict[str, Any]) -> Dict[str, str]:
	    """Flatten a raw profile dict into a dict of summary values.

	    Scalar fields are copied under new key names; list-valued sections are
	    condensed into a single string by the ``summarize_*`` helpers.

	    Keys that are absent *or* explicitly ``None`` are treated as empty:
	    scalar fields become ``""`` (``CurrentJobDuration`` no longer becomes
	    the literal ``"None"``) and list sections become ``[]`` before being
	    summarized, so a null section no longer raises ``TypeError``.

	    Args:
	        raw_profile: The raw profile mapping to summarize.

	    Returns:
	        A dict with one entry per profile section.
	    """
	    def _text(key: str) -> Any:
	        # ``dict.get(key, "")`` still returns None when the key exists
	        # with a null value, so normalize that case explicitly.
	        value = raw_profile.get(key)
	        return "" if value is None else value

	    def _records(key: str) -> List[Dict]:
	        # Same null-vs-missing concern as above, for list sections.
	        return raw_profile.get(key) or []

	    return {
	        "FullName": _text("fullName"),
	        "profile_url": _text("linkedinUrl"),
	        "Headline": _text("headline"),
	        "JobTitle": _text("jobTitle"),
	        "CompanyName": _text("companyName"),
	        "CompanyIndustry": _text("companyIndustry"),
	        "CurrentJobDuration": str(_text("currentJobDuration")),
	        "About": _text("about"),
	        "Experiences": summarize_generic(_records("experiences"), key='title'),
	        "Skills": summarize_skills(_records("skills")),
	        "Educations": summarize_educations(_records("educations")),
	        "Certifications": summarize_certs(_records("licenseAndCertificates")),
	        "HonorsAndAwards": summarize_generic(_records("honorsAndAwards"), key='title'),
	        "Verifications": summarize_generic(_records("verifications"), key='title'),
	        "Highlights": summarize_generic(_records("highlights"), key='title'),
	        "Projects": summarize_projects(_records("projects")),
	        "Publications": summarize_generic(_records("publications"), key='title'),
	        "Patents": summarize_generic(_records("patents"), key='title'),
	        "Courses": summarize_generic(_records("courses"), key='title'),
	        "TestScores": summarize_test_scores(_records("testScores")),
	    }

	# === Create & fill state ===


	def initialize_state(raw_profile: Dict[str, Any]) -> Dict[str,Any]:
	    """
	    Build the initial chatbot state (used in LangGraph) from a raw profile.

	    The returned dict contains:
	    - "profile": the output of ``preprocess_profile(raw_profile)``
	    - "profile_url": the profile URL passed through ``normalize_url``
	    - "sections": individual profile sections, each guaranteed to be a
	      string-or-empty value (never None)
	    - placeholders that tools fill in later, all empty / None at start
	    - "messages": an empty chat history
	    - "next_tool_name": None
	    """
	    # (section name in state, key in the preprocessed profile), in output order.
	    section_sources = (
	        ("about", "About"),
	        ("headline", "Headline"),
	        ("skills", "Skills"),
	        ("projects", "Projects"),
	        ("educations", "Educations"),
	        ("certifications", "Certifications"),
	        ("honors_and_awards", "HonorsAndAwards"),
	        ("experiences", "Experiences"),
	        ("publications", "Publications"),
	        ("patents", "Patents"),
	        ("courses", "Courses"),
	        ("test_scores", "TestScores"),
	        ("verifications", "Verifications"),
	        ("highlights", "Highlights"),
	        ("job_title", "JobTitle"),
	        ("company_name", "CompanyName"),
	        ("company_industry", "CompanyIndustry"),
	        ("current_job_duration", "CurrentJobDuration"),
	        ("full_name", "FullName"),
	    )

	    profile = preprocess_profile(raw_profile)
	    print(f"initializing url as {profile['profile_url']}")

	    # Falsy section values (missing, None, empty) collapse to "".
	    sections = {
	        name: profile.get(source) or ""
	        for name, source in section_sources
	    }

	    state: Dict[str, Any] = {}
	    state["profile"] = profile
	    state["profile_url"] = normalize_url(profile.get("profile_url") or "")
	    state["sections"] = sections

	    # Tool-populated placeholders.
	    state["enhanced_content"] = {}  # filled by the ContentGenerator tool
	    for slot in ("profile_analysis", "job_fit", "target_role", "editing_section"):
	        state[slot] = None

	    # Chat history: list of dicts like {"role": "user", "content": "..."}.
	    state["messages"] = []
	    state["next_tool_name"] = None
	    return state