Spaces:

pquiggles
/

vsp-demo

Runtime error

App Files Files Community

vsp-demo / src /vsp /app /main.py

pquiggles

fix after merge

819725d about 1 year ago

raw

history blame contribute delete

13.6 kB

	"""
	main.py

	This module serves as the main executable file entrypoint for the VSP Data Enrichment project.
	It provides functionality to process LinkedIn profiles and classify various aspects of a person's
	educational and professional background.

	The main class, VspDataEnrichment, encapsulates all the necessary classifiers and methods
	to perform a comprehensive analysis of a LinkedIn profile.

	Usage:
	from vsp.app.main import VspDataEnrichment

	vsp_enrichment = VspDataEnrichment()
	results = await vsp_enrichment.process_linkedin_profile(linkedin_profile)

	"""

	import asyncio
	import calendar
	from collections import defaultdict
	from datetime import date
	from typing import List, Mapping, Sequence

	from pydantic import BaseModel, Field

	from vsp.app.classifiers.education_classifier import EducationClassification, EducationClassifier
	from vsp.app.classifiers.work_experience.general_work_experience_classifier import (
	PrimaryJobType,
	SecondaryJobType,
	WorkExperienceClassification,
	WorkExperienceClassifier,
	)
	from vsp.app.classifiers.work_experience.investing_focus_asset_class_classifier import (
	InvestingFocusAssetClassClassification,
	InvestingFocusAssetClassClassifier,
	)
	from vsp.app.classifiers.work_experience.investing_focus_sector_classifier import (
	InvestingFocusSectorClassification,
	InvestingFocusSectorClassifier,
	)
	from vsp.app.classifiers.work_experience.investment_banking_group_classifier import (
	InvestmentBankingGroupClassification,
	InvestmentBankingGroupClassifier,
	)
	from vsp.app.model.linkedin.linkedin_models import Education, LinkedinProfile, Position


	class ClassifiedEducation(BaseModel):
	    """
	    Pairs a single education entry with the classification produced for it.

	    Attributes:
	        education (Education): The education item as it appears on the LinkedIn profile.
	        classification (EducationClassification): The classification results for that education item.
	    """

	    education: Education
	    classification: EducationClassification


	class ClassifiedWorkExperience(BaseModel):
	    """
	    Represents a classified work experience item from a LinkedIn profile.

	    The three specialised classifications are optional; each defaults to ``None``
	    and is only filled in when it applies to the position.

	    Attributes:
	        position (Position): The original position item from the LinkedIn profile.
	        work_experience_classification (WorkExperienceClassification): The general classification
	            results for the work experience.
	        investment_banking_classification (InvestmentBankingGroupClassification | None): The
	            investment banking classification results, if applicable.
	        investing_focus_asset_class_classification (InvestingFocusAssetClassClassification | None): The
	            investing focus asset class classification results, if applicable.
	        investing_focus_sector_classification (InvestingFocusSectorClassification | None): The
	            investing focus sector classification results, if applicable.
	    """

	    position: Position
	    work_experience_classification: WorkExperienceClassification
	    investment_banking_classification: InvestmentBankingGroupClassification | None = None
	    investing_focus_asset_class_classification: InvestingFocusAssetClassClassification | None = None
	    investing_focus_sector_classification: InvestingFocusSectorClassification | None = None


	class LinkedinProfileClassificationResults(BaseModel):
	    """
	    Container for everything produced when a LinkedIn profile is classified.

	    Attributes:
	        classified_educations (Sequence[ClassifiedEducation]): The classified education items.
	            Defaults to an empty list.
	        classified_work_experiences (Sequence[ClassifiedWorkExperience]): The classified work
	            experience items. Defaults to an empty list.
	        full_time_work_experience_years (float): Estimated years of full-time work experience.
	            Defaults to 0.0.
	        full_time_work_experience_by_secondary (Mapping[SecondaryJobType, float]): Estimated years of
	            full-time work experience keyed by secondary job type. Defaults to an empty dict.
	    """

	    classified_educations: Sequence[ClassifiedEducation] = Field(default_factory=list)
	    classified_work_experiences: Sequence[ClassifiedWorkExperience] = Field(default_factory=list)
	    full_time_work_experience_years: float = Field(default=0.0)
	    full_time_work_experience_by_secondary: Mapping[SecondaryJobType, float] = Field(default_factory=dict)


	class VspDataEnrichment:
	    """
	    Main class for the VSP Data Enrichment project.

	    This class holds one instance of each classifier and uses them to enrich a
	    LinkedIn profile with education and work-experience classifications, plus an
	    estimate of full-time experience per secondary job type.

	    Attributes:
	        _education_classifier (EducationClassifier): Classifier for education items.
	        _work_experience_classifier (WorkExperienceClassifier): Classifier for general work experiences.
	        _investment_banking_classifier (InvestmentBankingGroupClassifier): Classifier for investment
	            banking groups.
	        _investing_focus_asset_class_classifier (InvestingFocusAssetClassClassifier): Classifier for
	            investing focus asset classes.
	        _investing_focus_sector_classifier (InvestingFocusSectorClassifier): Classifier for investing
	            focus sectors.
	    """

	    def __init__(self) -> None:
	        """Initialize the VspDataEnrichment class with all required classifiers."""
	        self._education_classifier = EducationClassifier()
	        self._work_experience_classifier = WorkExperienceClassifier()
	        self._investment_banking_classifier = InvestmentBankingGroupClassifier()
	        self._investing_focus_asset_class_classifier = InvestingFocusAssetClassClassifier()
	        self._investing_focus_sector_classifier = InvestingFocusSectorClassifier()

	    @staticmethod
	    def _resolve_start_date(start) -> date | None:
	        """Return the earliest date matching a partial start date, or None if it is unusable."""
	        if not start or not start.year:
	            return None
	        try:
	            # A missing month/day is taken as the first month/day of the period.
	            return date(start.year, start.month or 1, start.day or 1)
	        except ValueError:
	            # Month or day outside the calendar: treat the position as undatable.
	            return None

	    @staticmethod
	    def _resolve_end_date(end, current_date: date) -> date | None:
	        """Return the latest date matching a partial end date, or None if it is unusable."""
	        if end is None:
	            # No end date at all means the position is still ongoing.
	            return current_date
	        if not end.year:
	            return None
	        try:
	            month = end.month or 12
	            # A missing day is taken as the last day of the month.
	            day = end.day or calendar.monthrange(end.year, month)[1]
	            return date(end.year, month, day)
	        except ValueError:
	            return None

	    def estimate_full_time_experience_by_secondary_job_type(
	        self,
	        classified_work_experiences: List[ClassifiedWorkExperience],
	        current_date: date | None = None,
	    ) -> Mapping[SecondaryJobType, float]:
	        """
	        Estimate years of full-time experience for each secondary job type.

	        Only experiences whose primary job type is ``PrimaryJobType.FULL_TIME`` and that
	        have a secondary job type are considered. Positions without a usable start year,
	        with an end date lacking a year, with dates outside the calendar, or whose start
	        falls after their end are skipped. Time during which several positions of the
	        same secondary job type overlap is counted once for that type; positions of
	        different types that overlap each receive the full overlapping time.

	        Args:
	            classified_work_experiences: The classified work experiences to aggregate.
	            current_date: Date used as the end of positions that have no end date.
	                Defaults to today's date.

	        Returns:
	            A mapping from secondary job type to years of experience (days / 365.25),
	            rounded to two decimals. Types with no elapsed time are omitted.
	        """
	        as_of = current_date if current_date is not None else date.today()

	        # Each interval contributes a +1 event at its start and a -1 event at its end.
	        events = []
	        for cwe in classified_work_experiences:
	            classification = cwe.work_experience_classification
	            secondary_job_type = classification.secondary_job_type
	            if classification.primary_job_type != PrimaryJobType.FULL_TIME or not secondary_job_type:
	                continue

	            start_date = self._resolve_start_date(cwe.position.start)
	            if start_date is None:
	                continue
	            end_date = self._resolve_end_date(cwe.position.end, as_of)
	            if end_date is None or start_date > end_date:
	                continue  # Skip invalid intervals

	            events.append((start_date, 1, secondary_job_type))
	            events.append((end_date, -1, secondary_job_type))

	        # Stable sort by date only: an interval's start always stays ahead of its own end.
	        events.sort(key=lambda event: event[0])

	        # Number of currently open intervals per type. A plain set would drop a type at
	        # the first "end" even while another position of the same type is still running.
	        active_counts = defaultdict(int)
	        durations = defaultdict(int)  # in days
	        last_date = None

	        for event_date, delta, job_type in events:
	            if last_date is not None and event_date > last_date:
	                elapsed_days = (event_date - last_date).days
	                # Credit the elapsed span to every type that was active during it.
	                for active_type in active_counts:
	                    durations[active_type] += elapsed_days

	            active_counts[job_type] += delta
	            if active_counts[job_type] <= 0:
	                del active_counts[job_type]

	            last_date = event_date

	        # Convert durations from days to years
	        return {stype: round(days / 365.25, 2) for stype, days in durations.items()}

	    async def process_linkedin_profile(self, profile: LinkedinProfile) -> LinkedinProfileClassificationResults:
	        """
	        Process a LinkedIn profile and classify its education and work experiences.

	        Results are returned in the same order as ``profile.educations`` and
	        ``profile.positions``. Positions that are not internships or extracurriculars
	        additionally receive an investment banking group classification (secondary type
	        INVESTMENT_BANKING) or asset-class and sector classifications (secondary type
	        INVESTING, unless the primary type is ADVISORY_BOARD_INVESTOR).

	        Args:
	            profile (LinkedinProfile): The LinkedIn profile to process.

	        Returns:
	            LinkedinProfileClassificationResults: The comprehensive classification results for the profile.
	        """
	        # Keep the awaitables in lists so that results line up index-for-index with the
	        # profile, even when two entries compare equal.
	        education_tasks = [
	            self._education_classifier.classify_education(profile, education)
	            for education in profile.educations
	        ]
	        work_experience_tasks = [
	            self._work_experience_classifier.classify_work_experience(profile, position)
	            for position in profile.positions
	        ]

	        # Run both groups concurrently and wait for all of them to complete.
	        education_results, work_experience_results = await asyncio.gather(
	            asyncio.gather(*education_tasks),
	            asyncio.gather(*work_experience_tasks),
	        )

	        # Create ClassifiedEducation objects in the original order
	        classified_educations = [
	            ClassifiedEducation(education=education, classification=classification)
	            for education, classification in zip(profile.educations, education_results)
	        ]

	        # Process work experiences and create ClassifiedWorkExperience objects
	        classified_work_experiences = []
	        for position, work_classification in zip(profile.positions, work_experience_results):
	            classified_work_experience = ClassifiedWorkExperience(
	                position=position, work_experience_classification=work_classification
	            )

	            primary_job_type = work_classification.primary_job_type
	            secondary_job_type = work_classification.secondary_job_type

	            # Compare against the enum classes rather than reaching members through the
	            # instance, which fails when the classified value is None.
	            if primary_job_type not in {PrimaryJobType.INTERNSHIP, PrimaryJobType.EXTRACURRICULAR}:
	                if secondary_job_type == SecondaryJobType.INVESTMENT_BANKING:
	                    ib_classification = await self._investment_banking_classifier.classify_investment_banking_group(
	                        profile, position
	                    )
	                    classified_work_experience.investment_banking_classification = ib_classification

	                if (
	                    secondary_job_type == SecondaryJobType.INVESTING
	                    and primary_job_type != PrimaryJobType.ADVISORY_BOARD_INVESTOR
	                ):
	                    asset_class_task = (
	                        self._investing_focus_asset_class_classifier.classify_investing_focus_asset_class(
	                            profile, position
	                        )
	                    )
	                    sector_task = self._investing_focus_sector_classifier.classify_investing_focus_sector(
	                        profile, position
	                    )

	                    asset_class_result, sector_result = await asyncio.gather(asset_class_task, sector_task)

	                    classified_work_experience.investing_focus_asset_class_classification = asset_class_result
	                    classified_work_experience.investing_focus_sector_classification = sector_result

	            classified_work_experiences.append(classified_work_experience)

	        experience_by_job_type = self.estimate_full_time_experience_by_secondary_job_type(classified_work_experiences)
	        # NOTE(review): overlapping positions of different secondary job types each get the
	        # full overlap, so this total can exceed elapsed calendar time - confirm intended.
	        # Rounded because summing two-decimal floats can leave binary noise.
	        total_work_experience = round(sum(experience_by_job_type.values()), 2)
	        return LinkedinProfileClassificationResults(
	            classified_educations=classified_educations,
	            classified_work_experiences=classified_work_experiences,
	            full_time_work_experience_years=total_work_experience,
	            full_time_work_experience_by_secondary=experience_by_job_type,
	        )


	async def main(profile_path: str = "tests/test_data/sample_profiles/eric_armagost.json") -> None:
	    """
	    Main function to demonstrate the usage of VspDataEnrichment.

	    This function loads a LinkedIn profile from a JSON file, processes it using
	    the VspDataEnrichment class, and prints the results as indented JSON.

	    Args:
	        profile_path: Path of the JSON profile to load. Defaults to the bundled
	            sample profile; a relative path is resolved against the current
	            working directory.
	    """
	    import json

	    # Load the LinkedIn profile; read as UTF-8 explicitly instead of relying on
	    # the platform's locale encoding.
	    with open(profile_path, encoding="utf-8") as f:
	        profile_data = json.load(f)
	    profile = LinkedinProfile.model_validate(profile_data)

	    # Create an instance of VspDataEnrichment and process the profile
	    vsp_enrichment = VspDataEnrichment()
	    results = await vsp_enrichment.process_linkedin_profile(profile)

	    # Print the results
	    print(results.model_dump_json(indent=2))


	# Run the demo only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    asyncio.run(main())