"""
main.py

This module serves as the main executable file entrypoint for the VSP Data Enrichment project.
It provides functionality to process LinkedIn profiles and classify various aspects of a
person's educational and professional background.

The main class, VspDataEnrichment, encapsulates all the necessary classifiers and methods to
perform a comprehensive analysis of a LinkedIn profile.

Usage:
    from vsp.app.main import VspDataEnrichment

    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(linkedin_profile)
"""

import asyncio
import calendar
from collections import Counter, defaultdict
from datetime import date
from typing import List, Mapping, Sequence

from pydantic import BaseModel, Field

from vsp.app.classifiers.education_classifier import EducationClassification, EducationClassifier
from vsp.app.classifiers.work_experience.general_work_experience_classifier import (
    PrimaryJobType,
    SecondaryJobType,
    WorkExperienceClassification,
    WorkExperienceClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_asset_class_classifier import (
    InvestingFocusAssetClassClassification,
    InvestingFocusAssetClassClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_sector_classifier import (
    InvestingFocusSectorClassification,
    InvestingFocusSectorClassifier,
)
from vsp.app.classifiers.work_experience.investment_banking_group_classifier import (
    InvestmentBankingGroupClassification,
    InvestmentBankingGroupClassifier,
)
from vsp.app.model.linkedin.linkedin_models import Education, LinkedinProfile, Position


class ClassifiedEducation(BaseModel):
    """
    Represents a classified education item from a LinkedIn profile.

    Attributes:
        education (Education): The original education item from the LinkedIn profile.
        classification (EducationClassification): The classification results for the education item.
    """

    education: Education
    classification: EducationClassification


class ClassifiedWorkExperience(BaseModel):
    """
    Represents a classified work experience item from a LinkedIn profile.

    Attributes:
        position (Position): The original position item from the LinkedIn profile.
        work_experience_classification (WorkExperienceClassification): The general classification
            results for the work experience.
        investment_banking_classification (InvestmentBankingGroupClassification | None): The
            investment banking classification results, if applicable.
        investing_focus_asset_class_classification (InvestingFocusAssetClassClassification | None):
            The investing focus asset class classification results, if applicable.
        investing_focus_sector_classification (InvestingFocusSectorClassification | None): The
            investing focus sector classification results, if applicable.
    """

    position: Position
    work_experience_classification: WorkExperienceClassification
    investment_banking_classification: InvestmentBankingGroupClassification | None = None
    investing_focus_asset_class_classification: InvestingFocusAssetClassClassification | None = None
    investing_focus_sector_classification: InvestingFocusSectorClassification | None = None


class LinkedinProfileClassificationResults(BaseModel):
    """
    Represents the complete classification results for a LinkedIn profile.

    Attributes:
        classified_educations (Sequence[ClassifiedEducation]): A sequence of classified
            education items.
        classified_work_experiences (Sequence[ClassifiedWorkExperience]): A sequence of
            classified work experience items.
        full_time_work_experience_years (float): Sum of the per-secondary-job-type durations
            below, in years.
        full_time_work_experience_by_secondary (Mapping[SecondaryJobType, float]): Estimated
            years of full-time experience for each secondary job type.
    """

    classified_educations: Sequence[ClassifiedEducation] = Field(default_factory=list)
    classified_work_experiences: Sequence[ClassifiedWorkExperience] = Field(default_factory=list)
    full_time_work_experience_years: float = Field(default=0.0)
    full_time_work_experience_by_secondary: Mapping[SecondaryJobType, float] = Field(default_factory=dict)


class VspDataEnrichment:
    """
    Main class for the VSP Data Enrichment project.

    This class encapsulates all the necessary classifiers and methods to process and enrich
    LinkedIn profile data with various classifications.

    Attributes:
        _education_classifier (EducationClassifier): Classifier for education items.
        _work_experience_classifier (WorkExperienceClassifier): Classifier for general work
            experiences.
        _investment_banking_classifier (InvestmentBankingGroupClassifier): Classifier for
            investment banking groups.
        _investing_focus_asset_class_classifier (InvestingFocusAssetClassClassifier): Classifier
            for investing focus asset classes.
        _investing_focus_sector_classifier (InvestingFocusSectorClassifier): Classifier for
            investing focus sectors.
    """

    def __init__(self) -> None:
        """Initialize the VspDataEnrichment class with all required classifiers."""
        self._education_classifier = EducationClassifier()
        self._work_experience_classifier = WorkExperienceClassifier()
        self._investment_banking_classifier = InvestmentBankingGroupClassifier()
        self._investing_focus_asset_class_classifier = InvestingFocusAssetClassClassifier()
        self._investing_focus_sector_classifier = InvestingFocusSectorClassifier()

    def estimate_full_time_experience_by_secondary_job_type(
        self,
        classified_work_experiences: List[ClassifiedWorkExperience],
        current_date: date | None = None,
    ) -> Mapping[SecondaryJobType, float]:
        """
        Estimate years of full-time experience per secondary job type.

        Only experiences whose primary job type is ``PrimaryJobType.FULL_TIME`` and that have
        a secondary job type are considered. Partial dates are normalized as follows:

        * a missing start month or day becomes 1;
        * a missing end (``position.end is None``) becomes ``current_date``;
        * a missing end month becomes 12, and a missing end day becomes the last day of
          the end month.

        Experiences without a start year, with an end lacking a year, or whose start falls
        after their end are skipped. Time covered by several overlapping positions of the
        *same* secondary job type is counted once; overlapping positions of *different*
        types each receive the full overlapping duration.

        Args:
            classified_work_experiences: The classified work experiences to aggregate.
            current_date: The date used as the end of ongoing positions. Defaults to
                today's date.

        Returns:
            A mapping from secondary job type to duration in years (days / 365.25, rounded
            to two decimals). Types that accumulate no time are omitted.

        Raises:
            ValueError: If a position's year/month/day do not form a valid calendar date.
        """
        as_of = current_date if current_date is not None else date.today()

        # Sweep-line events: (date, +1 for an interval opening / -1 for closing, job type).
        events: list[tuple[date, int, SecondaryJobType]] = []

        for cwe in classified_work_experiences:
            classification = cwe.work_experience_classification
            secondary_job_type = classification.secondary_job_type
            if classification.primary_job_type != PrimaryJobType.FULL_TIME or not secondary_job_type:
                continue

            # Normalize start date
            start = cwe.position.start
            if not start or not start.year:
                continue
            start_date = date(start.year, start.month or 1, start.day or 1)

            # Normalize end date
            end = cwe.position.end
            if end is None:
                end_date = as_of
            else:
                if not end.year:
                    continue
                end_month = end.month or 12
                # monthrange returns (weekday of first day, number of days in month).
                end_day = end.day or calendar.monthrange(end.year, end_month)[1]
                end_date = date(end.year, end_month, end_day)

            if start_date > end_date:
                continue  # Skip invalid intervals

            events.append((start_date, 1, secondary_job_type))
            events.append((end_date, -1, secondary_job_type))

        # The sort is stable, so a position's opening event always precedes its own
        # closing event and the per-type open count never drops below zero.
        events.sort(key=lambda event: event[0])

        # Number of currently open intervals per job type. A plain set would drop a type
        # as soon as the first of several overlapping positions of that type ended.
        open_counts: Counter = Counter()
        durations = defaultdict(int)  # in days
        last_date = None

        for event_date, delta, secondary_job_type in events:
            if last_date is not None and event_date > last_date:
                elapsed_days = (event_date - last_date).days
                for active_type, open_count in open_counts.items():
                    if open_count > 0:
                        durations[active_type] += elapsed_days
            open_counts[secondary_job_type] += delta
            last_date = event_date

        # Convert durations from days to years
        return {job_type: round(days / 365.25, 2) for job_type, days in durations.items()}

    async def process_linkedin_profile(self, profile: LinkedinProfile) -> LinkedinProfileClassificationResults:
        """
        Process a LinkedIn profile and classify its education and work experiences.

        This method maintains the original order of educations and work experiences from the
        input profile while performing asynchronous classification tasks.

        Args:
            profile (LinkedinProfile): The LinkedIn profile to process.

        Returns:
            LinkedinProfileClassificationResults: The comprehensive classification results for
            the profile.
        """
        # Build the coroutines in profile order (not in dicts keyed by the items): gather
        # returns results in argument order, so zipping with the profile lists below stays
        # aligned even when two items compare equal or are unhashable.
        education_results = await asyncio.gather(
            *(self._education_classifier.classify_education(profile, education) for education in profile.educations)
        )
        work_experience_results = await asyncio.gather(
            *(
                self._work_experience_classifier.classify_work_experience(profile, position)
                for position in profile.positions
            )
        )

        # Create ClassifiedEducation objects in the original order
        classified_educations = [
            ClassifiedEducation(education=education, classification=classification)
            for education, classification in zip(profile.educations, education_results)
        ]

        # Process work experiences and create ClassifiedWorkExperience objects
        classified_work_experiences = []
        for position, work_classification in zip(profile.positions, work_experience_results):
            classified_work_experience = ClassifiedWorkExperience(
                position=position, work_experience_classification=work_classification
            )

            # Compare against the enum classes rather than reaching members through the
            # instance, so a missing (None) job type does not raise AttributeError.
            primary_job_type = work_classification.primary_job_type
            secondary_job_type = work_classification.secondary_job_type

            if primary_job_type not in {PrimaryJobType.INTERNSHIP, PrimaryJobType.EXTRACURRICULAR}:
                if secondary_job_type == SecondaryJobType.INVESTMENT_BANKING:
                    ib_classification = await self._investment_banking_classifier.classify_investment_banking_group(
                        profile, position
                    )
                    classified_work_experience.investment_banking_classification = ib_classification

                if (
                    secondary_job_type == SecondaryJobType.INVESTING
                    and primary_job_type != PrimaryJobType.ADVISORY_BOARD_INVESTOR
                ):
                    asset_class_task = (
                        self._investing_focus_asset_class_classifier.classify_investing_focus_asset_class(
                            profile, position
                        )
                    )
                    sector_task = self._investing_focus_sector_classifier.classify_investing_focus_sector(
                        profile, position
                    )
                    asset_class_result, sector_result = await asyncio.gather(asset_class_task, sector_task)
                    classified_work_experience.investing_focus_asset_class_classification = asset_class_result
                    classified_work_experience.investing_focus_sector_classification = sector_result

            classified_work_experiences.append(classified_work_experience)

        experience_by_job_type = self.estimate_full_time_experience_by_secondary_job_type(classified_work_experiences)
        # NOTE(review): this sums per-type durations, so time spent in concurrent positions of
        # different secondary job types is counted once per type — confirm this is intended.
        total_work_experience = sum(experience_by_job_type.values())

        return LinkedinProfileClassificationResults(
            classified_educations=classified_educations,
            classified_work_experiences=classified_work_experiences,
            full_time_work_experience_years=total_work_experience,
            full_time_work_experience_by_secondary=experience_by_job_type,
        )


async def main() -> None:
    """
    Main function to demonstrate the usage of VspDataEnrichment.

    This function loads a sample LinkedIn profile from a JSON file, processes it using the
    VspDataEnrichment class, and prints the results.
    """
    import json

    # Load a sample LinkedIn profile
    with open("tests/test_data/sample_profiles/eric_armagost.json", encoding="utf-8") as f:
        profile_data = json.load(f)

    profile = LinkedinProfile.model_validate(profile_data)

    # Create an instance of VspDataEnrichment and process the profile
    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(profile)

    # Print the results
    print(results.model_dump_json(indent=2))


if __name__ == "__main__":
    asyncio.run(main())