vsp-demo / src /vsp /app /main.py
pquiggles's picture
fix after merge
819725d
"""
main.py
This module serves as the main executable file entrypoint for the VSP Data Enrichment project.
It provides functionality to process LinkedIn profiles and classify various aspects of a person's
educational and professional background.
The main class, VspDataEnrichment, encapsulates all the necessary classifiers and methods
to perform a comprehensive analysis of a LinkedIn profile.
Usage:
from vsp.app.main import VspDataEnrichment
vsp_enrichment = VspDataEnrichment()
results = await vsp_enrichment.process_linkedin_profile(linkedin_profile)
"""
import asyncio
import calendar
from collections import defaultdict
from datetime import date
from typing import List, Mapping, Sequence
from pydantic import BaseModel, Field
from vsp.app.classifiers.education_classifier import EducationClassification, EducationClassifier
from vsp.app.classifiers.work_experience.general_work_experience_classifier import (
PrimaryJobType,
SecondaryJobType,
WorkExperienceClassification,
WorkExperienceClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_asset_class_classifier import (
InvestingFocusAssetClassClassification,
InvestingFocusAssetClassClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_sector_classifier import (
InvestingFocusSectorClassification,
InvestingFocusSectorClassifier,
)
from vsp.app.classifiers.work_experience.investment_banking_group_classifier import (
InvestmentBankingGroupClassification,
InvestmentBankingGroupClassifier,
)
from vsp.app.model.linkedin.linkedin_models import Education, LinkedinProfile, Position
class ClassifiedEducation(BaseModel):
    """
    Pairs one education entry of a LinkedIn profile with its classification.

    Attributes:
        education (Education): The education entry exactly as it appears on the profile.
        classification (EducationClassification): The classifier output for that entry.
    """

    education: Education
    classification: EducationClassification
class ClassifiedWorkExperience(BaseModel):
    """
    Pairs one position of a LinkedIn profile with every classification produced for it.

    Only the general work-experience classification is mandatory; the three
    specialised classifications default to ``None`` and are filled in only
    when they apply to the position.

    Attributes:
        position (Position): The position exactly as it appears on the profile.
        work_experience_classification (WorkExperienceClassification): The general
            classification of the position.
        investment_banking_classification (InvestmentBankingGroupClassification | None):
            The investment banking group classification, if applicable.
        investing_focus_asset_class_classification (InvestingFocusAssetClassClassification | None):
            The investing focus asset class classification, if applicable.
        investing_focus_sector_classification (InvestingFocusSectorClassification | None):
            The investing focus sector classification, if applicable.
    """

    position: Position
    work_experience_classification: WorkExperienceClassification
    investment_banking_classification: InvestmentBankingGroupClassification | None = None
    investing_focus_asset_class_classification: InvestingFocusAssetClassClassification | None = None
    investing_focus_sector_classification: InvestingFocusSectorClassification | None = None
class LinkedinProfileClassificationResults(BaseModel):
    """
    Aggregated classification output for a whole LinkedIn profile.

    Attributes:
        classified_educations (Sequence[ClassifiedEducation]): The classified education
            entries. Defaults to an empty list.
        classified_work_experiences (Sequence[ClassifiedWorkExperience]): The classified
            positions. Defaults to an empty list.
        full_time_work_experience_years (float): Total full-time experience in years.
            Defaults to 0.0.
        full_time_work_experience_by_secondary (Mapping[SecondaryJobType, float]):
            Full-time experience in years, keyed by secondary job type. Defaults to
            an empty dict.
    """

    classified_educations: Sequence[ClassifiedEducation] = Field(default_factory=list)
    classified_work_experiences: Sequence[ClassifiedWorkExperience] = Field(default_factory=list)
    full_time_work_experience_years: float = 0.0
    full_time_work_experience_by_secondary: Mapping[SecondaryJobType, float] = Field(default_factory=dict)
class VspDataEnrichment:
    """
    Main class for the VSP Data Enrichment project.

    This class encapsulates all the necessary classifiers and methods to process
    and enrich LinkedIn profile data with various classifications.

    Attributes:
        _education_classifier (EducationClassifier): Classifier for education items.
        _work_experience_classifier (WorkExperienceClassifier): Classifier for general work experiences.
        _investment_banking_classifier (InvestmentBankingGroupClassifier): Classifier for
            investment banking groups.
        _investing_focus_asset_class_classifier (InvestingFocusAssetClassClassifier): Classifier
            for investing focus asset classes.
        _investing_focus_sector_classifier (InvestingFocusSectorClassifier): Classifier for
            investing focus sectors.
    """

    def __init__(self) -> None:
        """Initialize the VspDataEnrichment class with all required classifiers."""
        self._education_classifier = EducationClassifier()
        self._work_experience_classifier = WorkExperienceClassifier()
        self._investment_banking_classifier = InvestmentBankingGroupClassifier()
        self._investing_focus_asset_class_classifier = InvestingFocusAssetClassClassifier()
        self._investing_focus_sector_classifier = InvestingFocusSectorClassifier()

    def estimate_full_time_experience_by_secondary_job_type(
        self,
        classified_work_experiences: List[ClassifiedWorkExperience],
        as_of: date | None = None,
    ) -> Mapping[SecondaryJobType, float]:
        """
        Estimate the years of full-time experience per secondary job type.

        Only positions whose primary job type is ``PrimaryJobType.FULL_TIME`` and
        that have a secondary job type are considered. Each such position becomes
        a date interval; time during which several positions of the *same*
        secondary job type overlap is counted once for that type. An interval
        during which several *different* job types are active counts in full
        toward each of them.

        Date normalisation:
            * A missing start month/day defaults to 1.
            * A missing end month defaults to 12; a missing end day defaults to
              the last day of the end month.
            * A position without an end is treated as ongoing until ``as_of``.
            * Positions without a start year, with an end lacking a year, or
              whose start lies after their end are skipped.

        Args:
            classified_work_experiences: The classified positions to aggregate.
            as_of: The date used as the end of ongoing positions. Defaults to
                today's date.

        Returns:
            A mapping from secondary job type to years of experience, rounded to
            two decimals (a year is taken as 365.25 days). Job types that never
            accumulate any time are absent from the mapping.
        """
        current_date = as_of if as_of is not None else date.today()

        # Sweep-line events: (date, +1 for a position starting / -1 for one ending, job type).
        events = []
        for cwe in classified_work_experiences:
            classification = cwe.work_experience_classification
            secondary_job_type = classification.secondary_job_type
            if classification.primary_job_type != PrimaryJobType.FULL_TIME or not secondary_job_type:
                continue

            start = cwe.position.start
            if not start or not start.year:
                continue
            start_date = date(start.year, start.month or 1, start.day or 1)

            end = cwe.position.end
            if end is None:
                end_date = current_date
            else:
                if not end.year:
                    continue
                end_month = end.month or 12
                # Fall back to the last day of the month when the day is unknown.
                end_day = end.day or calendar.monthrange(end.year, end_month)[1]
                end_date = date(end.year, end_month, end_day)

            if start_date > end_date:
                continue  # Skip invalid intervals

            events.append((start_date, 1, secondary_job_type))
            events.append((end_date, -1, secondary_job_type))

        # The sort is stable and every position's start is appended before its end,
        # so a start is always processed before the matching end.
        events.sort(key=lambda event: event[0])

        # Number of currently open positions per job type. A count (rather than a
        # set) keeps a type active until *all* of its overlapping positions ended.
        open_positions = defaultdict(int)
        durations = defaultdict(int)  # in days
        last_date = None
        for event_date, delta, secondary_job_type in events:
            if last_date is not None and event_date > last_date:
                interval_duration = (event_date - last_date).days
                for active_type, open_count in open_positions.items():
                    if open_count > 0:
                        durations[active_type] += interval_duration
            open_positions[secondary_job_type] += delta
            last_date = event_date

        # Convert durations from days to years
        return {stype: round(days / 365.25, 2) for stype, days in durations.items()}

    async def process_linkedin_profile(self, profile: LinkedinProfile) -> LinkedinProfileClassificationResults:
        """
        Process a LinkedIn profile and classify its education and work experiences.

        The results keep the order of ``profile.educations`` and
        ``profile.positions``. Educations are classified concurrently, then
        positions are classified concurrently; the follow-up classifications
        (investment banking group, investing focus) are awaited position by
        position.

        Args:
            profile (LinkedinProfile): The LinkedIn profile to process.

        Returns:
            LinkedinProfileClassificationResults: The classified educations and
            positions, the per-secondary-job-type full-time experience, and its
            total. The total is the sum of the per-type values, so time spent in
            overlapping positions of different job types is included once per type.
        """
        # One awaitable per item, kept in lists so that equal (duplicate) items
        # cannot collapse into a single entry and misalign the zip below.
        education_tasks = [
            self._education_classifier.classify_education(profile, education) for education in profile.educations
        ]
        work_experience_tasks = [
            self._work_experience_classifier.classify_work_experience(profile, position)
            for position in profile.positions
        ]

        # asyncio.gather returns results in the order the awaitables were passed.
        education_results = await asyncio.gather(*education_tasks)
        work_experience_results = await asyncio.gather(*work_experience_tasks)

        classified_educations = [
            ClassifiedEducation(education=education, classification=classification)
            for education, classification in zip(profile.educations, education_results)
        ]

        classified_work_experiences = []
        for position, work_classification in zip(profile.positions, work_experience_results):
            classified_work_experience = ClassifiedWorkExperience(
                position=position, work_experience_classification=work_classification
            )
            primary_job_type = work_classification.primary_job_type
            secondary_job_type = work_classification.secondary_job_type

            # Enum members are referenced through their classes so that a missing
            # (None) job type simply fails the comparison instead of raising.
            if primary_job_type not in {PrimaryJobType.INTERNSHIP, PrimaryJobType.EXTRACURRICULAR}:
                if secondary_job_type == SecondaryJobType.INVESTMENT_BANKING:
                    ib_classification = await self._investment_banking_classifier.classify_investment_banking_group(
                        profile, position
                    )
                    classified_work_experience.investment_banking_classification = ib_classification

                if (
                    secondary_job_type == SecondaryJobType.INVESTING
                    and primary_job_type != PrimaryJobType.ADVISORY_BOARD_INVESTOR
                ):
                    asset_class_task = (
                        self._investing_focus_asset_class_classifier.classify_investing_focus_asset_class(
                            profile, position
                        )
                    )
                    sector_task = self._investing_focus_sector_classifier.classify_investing_focus_sector(
                        profile, position
                    )
                    asset_class_result, sector_result = await asyncio.gather(asset_class_task, sector_task)
                    classified_work_experience.investing_focus_asset_class_classification = asset_class_result
                    classified_work_experience.investing_focus_sector_classification = sector_result

            classified_work_experiences.append(classified_work_experience)

        experience_by_job_type = self.estimate_full_time_experience_by_secondary_job_type(classified_work_experiences)
        total_work_experience = sum(experience_by_job_type.values())

        return LinkedinProfileClassificationResults(
            classified_educations=classified_educations,
            classified_work_experiences=classified_work_experiences,
            full_time_work_experience_years=total_work_experience,
            full_time_work_experience_by_secondary=experience_by_job_type,
        )
async def main() -> None:
    """
    Main function to demonstrate the usage of VspDataEnrichment.

    This function loads a sample LinkedIn profile from a JSON file,
    processes it using the VspDataEnrichment class, and prints the results
    as indented JSON.
    """
    import json

    # Load a sample LinkedIn profile. The encoding is explicit so the file is
    # decoded the same way regardless of the platform's default encoding.
    with open("tests/test_data/sample_profiles/eric_armagost.json", encoding="utf-8") as f:
        profile_data = json.load(f)
    profile = LinkedinProfile.model_validate(profile_data)

    # Create an instance of VspDataEnrichment and process the profile
    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(profile)

    # Print the results
    print(results.model_dump_json(indent=2))


if __name__ == "__main__":
    asyncio.run(main())