|
|
""" |
|
|
main.py |
|
|
|
|
|
This module serves as the main executable file entrypoint for the VSP Data Enrichment project. |
|
|
It provides functionality to process LinkedIn profiles and classify various aspects of a person's |
|
|
educational and professional background. |
|
|
|
|
|
The main class, VspDataEnrichment, encapsulates all the necessary classifiers and methods |
|
|
to perform a comprehensive analysis of a LinkedIn profile. |
|
|
|
|
|
Usage: |
|
|
from vsp.app.main import VspDataEnrichment |
|
|
|
|
|
vsp_enrichment = VspDataEnrichment() |
|
|
results = await vsp_enrichment.process_linkedin_profile(linkedin_profile) |
|
|
|
|
|
""" |
|
|
|
|
|
import asyncio |
|
|
import calendar |
|
|
from collections import defaultdict |
|
|
from datetime import date |
|
|
from typing import List, Mapping, Sequence |
|
|
|
|
|
from pydantic import BaseModel, Field |
|
|
|
|
|
from vsp.app.classifiers.education_classifier import EducationClassification, EducationClassifier |
|
|
from vsp.app.classifiers.work_experience.general_work_experience_classifier import ( |
|
|
PrimaryJobType, |
|
|
SecondaryJobType, |
|
|
WorkExperienceClassification, |
|
|
WorkExperienceClassifier, |
|
|
) |
|
|
from vsp.app.classifiers.work_experience.investing_focus_asset_class_classifier import ( |
|
|
InvestingFocusAssetClassClassification, |
|
|
InvestingFocusAssetClassClassifier, |
|
|
) |
|
|
from vsp.app.classifiers.work_experience.investing_focus_sector_classifier import ( |
|
|
InvestingFocusSectorClassification, |
|
|
InvestingFocusSectorClassifier, |
|
|
) |
|
|
from vsp.app.classifiers.work_experience.investment_banking_group_classifier import ( |
|
|
InvestmentBankingGroupClassification, |
|
|
InvestmentBankingGroupClassifier, |
|
|
) |
|
|
from vsp.app.model.linkedin.linkedin_models import Education, LinkedinProfile, Position |
|
|
|
|
|
|
|
|
class ClassifiedEducation(BaseModel):
    """
    Pairs one education entry of a LinkedIn profile with its classification result.

    Attributes:
        education (Education): The original education item from the LinkedIn profile.
        classification (EducationClassification): The classification results for the education item.
    """

    # Both fields are required: neither declares a default value.
    education: Education
    classification: EducationClassification
|
|
|
|
|
|
|
|
class ClassifiedWorkExperience(BaseModel):
    """
    Pairs one position of a LinkedIn profile with its classification results.

    Only ``position`` and ``work_experience_classification`` are required. The three
    specialised classifications default to ``None`` and are filled in by
    ``VspDataEnrichment.process_linkedin_profile`` only for positions they apply to.

    Attributes:
        position (Position): The original position item from the LinkedIn profile.
        work_experience_classification (WorkExperienceClassification): The general classification results
            for the work experience.
        investment_banking_classification (
            InvestmentBankingGroupClassification | None
        ): The investment banking classification results, if applicable.
        investing_focus_asset_class_classification (
            InvestingFocusAssetClassClassification | None
        ): The investing focus asset class classification results, if applicable.
        investing_focus_sector_classification (
            InvestingFocusSectorClassification | None
        ): The investing focus sector classification results, if applicable.
    """

    position: Position
    work_experience_classification: WorkExperienceClassification
    # Optional follow-up classifications; None means "not applicable / not computed".
    investment_banking_classification: InvestmentBankingGroupClassification | None = None
    investing_focus_asset_class_classification: InvestingFocusAssetClassClassification | None = None
    investing_focus_sector_classification: InvestingFocusSectorClassification | None = None
|
|
|
|
|
|
|
|
class LinkedinProfileClassificationResults(BaseModel):
    """
    Aggregated output of classifying one LinkedIn profile.

    Every field has a default, so an empty result can be constructed without arguments.

    Attributes:
        classified_educations (Sequence[ClassifiedEducation]): The classified education items.
        classified_work_experiences (Sequence[ClassifiedWorkExperience]): The classified work
            experience items.
        full_time_work_experience_years (float): Total full-time work experience, in years.
        full_time_work_experience_by_secondary (Mapping[SecondaryJobType, float]): Full-time work
            experience in years, broken down by secondary job type.
    """

    classified_educations: Sequence[ClassifiedEducation] = Field(default_factory=list)
    classified_work_experiences: Sequence[ClassifiedWorkExperience] = Field(default_factory=list)
    full_time_work_experience_years: float = 0.0
    full_time_work_experience_by_secondary: Mapping[SecondaryJobType, float] = Field(default_factory=dict)
|
|
|
|
|
|
|
|
class VspDataEnrichment:
    """
    Main class for the VSP Data Enrichment project.

    This class encapsulates all the necessary classifiers and methods to process
    and enrich LinkedIn profile data with various classifications.

    Attributes:
        _education_classifier (EducationClassifier): Classifier for education items.
        _work_experience_classifier (WorkExperienceClassifier): Classifier for general work experiences.
        _investment_banking_classifier (InvestmentBankingGroupClassifier): Classifier for investment banking groups.
        _investing_focus_asset_class_classifier (
            InvestingFocusAssetClassClassifier
        ): Classifier for investing focus asset classes.
        _investing_focus_sector_classifier (InvestingFocusSectorClassifier): Classifier for investing focus sectors.
    """

    def __init__(self) -> None:
        """Initialize the VspDataEnrichment class with all required classifiers."""
        self._education_classifier = EducationClassifier()
        self._work_experience_classifier = WorkExperienceClassifier()
        self._investment_banking_classifier = InvestmentBankingGroupClassifier()
        self._investing_focus_asset_class_classifier = InvestingFocusAssetClassClassifier()
        self._investing_focus_sector_classifier = InvestingFocusSectorClassifier()

    @staticmethod
    def _position_interval(position: Position, current_date: date) -> tuple[date, date] | None:
        """
        Resolve the (start, end) dates covered by a position.

        A missing start month/day falls back to the first month/day; a missing end
        month falls back to December and a missing end day to the last day of the
        end month. A position without an end is treated as ongoing until ``current_date``.

        Returns:
            The ``(start_date, end_date)`` pair, or ``None`` when the position has no usable
            start/end year, describes an invalid calendar date, or starts after it ends.
        """
        start = position.start
        if not start or not start.year:
            return None

        end = position.end
        if end is not None and not end.year:
            return None

        try:
            start_date = date(start.year, start.month or 1, start.day or 1)
            if end is None:
                end_date = current_date
            else:
                end_month = end.month or 12
                end_day = end.day or calendar.monthrange(end.year, end_month)[1]
                end_date = date(end.year, end_month, end_day)
        except ValueError:
            # Malformed profile data (e.g. Feb 30th or month 13): skip the position
            # instead of failing the whole estimate.
            return None

        if start_date > end_date:
            return None
        return start_date, end_date

    @staticmethod
    def _active_days_by_type(
        intervals: Sequence[tuple[date, date, SecondaryJobType]],
    ) -> dict[SecondaryJobType, int]:
        """
        Count, per job type, the days on which at least one interval of that type is active.

        Overlapping intervals of the same type are counted once. Intervals of different
        types are counted independently of each other.
        """
        events = []
        for start_date, end_date, job_type in intervals:
            events.append((start_date, 1, job_type))
            events.append((end_date, -1, job_type))
        # Sort on the date only: the sort is stable, so a position's start always
        # precedes its own end and the active counters never go negative.
        events.sort(key=lambda event: event[0])

        # Number of currently open positions per type. A plain set would drop a type
        # as soon as the first of two overlapping positions of that type ends.
        active_counts = defaultdict(int)
        durations = defaultdict(int)
        last_date = None

        for event_date, delta, job_type in events:
            if last_date is not None and event_date > last_date:
                elapsed_days = (event_date - last_date).days
                for active_type, open_positions in active_counts.items():
                    if open_positions > 0:
                        durations[active_type] += elapsed_days
            active_counts[job_type] += delta
            last_date = event_date

        return dict(durations)

    def estimate_full_time_experience_by_secondary_job_type(
        self,
        classified_work_experiences: List[ClassifiedWorkExperience],
        current_date: date | None = None,
    ) -> Mapping[SecondaryJobType, float]:
        """
        Estimate the years of full-time experience per secondary job type.

        Only work experiences whose primary job type is ``PrimaryJobType.FULL_TIME`` and
        that have a secondary job type are considered. Positions whose dates cannot be
        resolved are skipped.

        Args:
            classified_work_experiences (List[ClassifiedWorkExperience]): The classified work experiences.
            current_date (date | None): End date assumed for positions without an end.
                Defaults to today's date.

        Returns:
            Mapping[SecondaryJobType, float]: Years of experience (days / 365.25, rounded to two
            decimals) for every secondary job type with a non-empty active period.
        """
        reference_date = date.today() if current_date is None else current_date

        intervals = []
        for cwe in classified_work_experiences:
            classification = cwe.work_experience_classification
            secondary_job_type = classification.secondary_job_type
            if classification.primary_job_type != PrimaryJobType.FULL_TIME or not secondary_job_type:
                continue

            interval = self._position_interval(cwe.position, reference_date)
            if interval is None:
                continue
            start_date, end_date = interval
            intervals.append((start_date, end_date, secondary_job_type))

        days_by_type = self._active_days_by_type(intervals)
        return {job_type: round(days / 365.25, 2) for job_type, days in days_by_type.items()}

    async def _classify_position_details(
        self,
        profile: LinkedinProfile,
        position: Position,
        work_classification: WorkExperienceClassification,
    ) -> ClassifiedWorkExperience:
        """Wrap a position with its general classification and run the applicable follow-up classifiers."""
        classified_work_experience = ClassifiedWorkExperience(
            position=position, work_experience_classification=work_classification
        )

        primary_job_type = work_classification.primary_job_type
        secondary_job_type = work_classification.secondary_job_type

        # Internships and extracurriculars never get the follow-up classifications.
        if primary_job_type in {PrimaryJobType.INTERNSHIP, PrimaryJobType.EXTRACURRICULAR}:
            return classified_work_experience

        # Compare against the enum classes rather than reaching members through the
        # instance, which raises AttributeError when the job type is None.
        if secondary_job_type == SecondaryJobType.INVESTMENT_BANKING:
            classified_work_experience.investment_banking_classification = (
                await self._investment_banking_classifier.classify_investment_banking_group(profile, position)
            )

        if (
            secondary_job_type == SecondaryJobType.INVESTING
            and primary_job_type != PrimaryJobType.ADVISORY_BOARD_INVESTOR
        ):
            asset_class_result, sector_result = await asyncio.gather(
                self._investing_focus_asset_class_classifier.classify_investing_focus_asset_class(profile, position),
                self._investing_focus_sector_classifier.classify_investing_focus_sector(profile, position),
            )
            classified_work_experience.investing_focus_asset_class_classification = asset_class_result
            classified_work_experience.investing_focus_sector_classification = sector_result

        return classified_work_experience

    async def process_linkedin_profile(self, profile: LinkedinProfile) -> LinkedinProfileClassificationResults:
        """
        Process a LinkedIn profile and classify its education and work experiences.

        This method maintains the original order of educations and work experiences
        from the input profile while performing asynchronous classification tasks.

        Args:
            profile (LinkedinProfile): The LinkedIn profile to process.

        Returns:
            LinkedinProfileClassificationResults: The comprehensive classification results for the profile.
        """
        educations = profile.educations
        positions = profile.positions

        # Coroutines are kept in sequences (not dicts keyed by the item) so results stay
        # aligned with the input even when the profile contains duplicate entries.
        education_results, work_experience_results = await asyncio.gather(
            asyncio.gather(
                *(self._education_classifier.classify_education(profile, education) for education in educations)
            ),
            asyncio.gather(
                *(
                    self._work_experience_classifier.classify_work_experience(profile, position)
                    for position in positions
                )
            ),
        )

        classified_educations = [
            ClassifiedEducation(education=education, classification=classification)
            for education, classification in zip(educations, education_results)
        ]

        # gather() returns results in argument order, so the profile's position order is preserved.
        classified_work_experiences = await asyncio.gather(
            *(
                self._classify_position_details(profile, position, work_classification)
                for position, work_classification in zip(positions, work_experience_results)
            )
        )

        experience_by_job_type = self.estimate_full_time_experience_by_secondary_job_type(classified_work_experiences)
        # NOTE: this is the sum of the per-type totals, so periods in which positions of
        # different secondary job types overlap are counted once per type.
        total_work_experience = round(sum(experience_by_job_type.values()), 2)

        return LinkedinProfileClassificationResults(
            classified_educations=classified_educations,
            classified_work_experiences=classified_work_experiences,
            full_time_work_experience_years=total_work_experience,
            full_time_work_experience_by_secondary=experience_by_job_type,
        )
|
|
|
|
|
|
|
|
async def main() -> None:
    """
    Main function to demonstrate the usage of VspDataEnrichment.

    This function loads a sample LinkedIn profile from a JSON file,
    processes it using the VspDataEnrichment class, and prints the results.
    The sample path is relative to the current working directory.
    """
    import json

    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open("tests/test_data/sample_profiles/eric_armagost.json", encoding="utf-8") as f:
        profile_data = json.load(f)
    profile = LinkedinProfile.model_validate(profile_data)

    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(profile)

    print(results.model_dump_json(indent=2))
|
|
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
|
|
|