# Source: Hugging Face Space "pquiggles/vsp-demo" (status at capture: "Runtime error"; file size: 13,604 bytes).

"""
main.py

This module is the main entry point for the VSP Data Enrichment project.
It provides functionality to process LinkedIn profiles and to classify various aspects of a person's
educational and professional background.

The main class, VspDataEnrichment, encapsulates all the necessary classifiers and methods
to perform a comprehensive analysis of a LinkedIn profile.

Usage:
    from vsp.app.main import VspDataEnrichment

    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(linkedin_profile)

"""

import asyncio
import calendar
from collections import defaultdict
from datetime import date
from typing import List, Mapping, Sequence

from pydantic import BaseModel, Field

from vsp.app.classifiers.education_classifier import EducationClassification, EducationClassifier
from vsp.app.classifiers.work_experience.general_work_experience_classifier import (
    PrimaryJobType,
    SecondaryJobType,
    WorkExperienceClassification,
    WorkExperienceClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_asset_class_classifier import (
    InvestingFocusAssetClassClassification,
    InvestingFocusAssetClassClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_sector_classifier import (
    InvestingFocusSectorClassification,
    InvestingFocusSectorClassifier,
)
from vsp.app.classifiers.work_experience.investment_banking_group_classifier import (
    InvestmentBankingGroupClassification,
    InvestmentBankingGroupClassifier,
)
from vsp.app.model.linkedin.linkedin_models import Education, LinkedinProfile, Position


class ClassifiedEducation(BaseModel):
    """
    Pairs one education item from a LinkedIn profile with its classification.

    Both fields are required; the model has no defaults.

    Attributes:
        education (Education): The original education item from the LinkedIn profile.
        classification (EducationClassification): The classification results for the education item.
    """

    # The untouched education entry, kept alongside its classification.
    education: Education
    # The classifier output for ``education``.
    classification: EducationClassification


class ClassifiedWorkExperience(BaseModel):
    """
    Pairs one position from a LinkedIn profile with its classification results.

    ``position`` and ``work_experience_classification`` are required. The three
    specialised classifications are optional and default to ``None``.

    Attributes:
        position (Position): The original position item from the LinkedIn profile.
        work_experience_classification (WorkExperienceClassification): The general classification results
        for the work experience.
        investment_banking_classification (
            InvestmentBankingGroupClassification | None
        ): The investment banking classification results, if applicable.
        investing_focus_asset_class_classification (
            InvestingFocusAssetClassClassification | None
        ): The investing focus asset class classification results, if applicable.
        investing_focus_sector_classification (
            InvestingFocusSectorClassification | None
        ): The investing focus sector classification results, if applicable.
    """

    position: Position
    work_experience_classification: WorkExperienceClassification
    # The fields below stay None unless a specialised classifier result is assigned.
    investment_banking_classification: InvestmentBankingGroupClassification | None = None
    investing_focus_asset_class_classification: InvestingFocusAssetClassClassification | None = None
    investing_focus_sector_classification: InvestingFocusSectorClassification | None = None


class LinkedinProfileClassificationResults(BaseModel):
    """
    Represents the complete classification results for a LinkedIn profile.

    Every field has a default, so an empty result can be constructed with no arguments.

    Attributes:
        classified_educations (Sequence[ClassifiedEducation]): A sequence of classified education items.
            Defaults to an empty list.
        classified_work_experiences (Sequence[ClassifiedWorkExperience]): A sequence of classified work
            experience items. Defaults to an empty list.
        full_time_work_experience_years (float): Total full-time work experience in years, derived from
            the per-type values in ``full_time_work_experience_by_secondary``. Defaults to 0.0.
        full_time_work_experience_by_secondary (Mapping[SecondaryJobType, float]): Full-time work
            experience in years, keyed by secondary job type. Defaults to an empty dict.
    """

    classified_educations: Sequence[ClassifiedEducation] = Field(default_factory=list)
    classified_work_experiences: Sequence[ClassifiedWorkExperience] = Field(default_factory=list)
    full_time_work_experience_years: float = Field(default=0.0)
    full_time_work_experience_by_secondary: Mapping[SecondaryJobType, float] = Field(default_factory=dict)


class VspDataEnrichment:
    """
    Main class for the VSP Data Enrichment project.

    This class encapsulates all the necessary classifiers and methods to process
    and enrich LinkedIn profile data with various classifications.

    Attributes:
        _education_classifier (EducationClassifier): Classifier for education items.
        _work_experience_classifier (WorkExperienceClassifier): Classifier for general work experiences.
        _investment_banking_classifier (InvestmentBankingGroupClassifier): Classifier for investment banking groups.
        _investing_focus_asset_class_classifier (
            InvestingFocusAssetClassClassifier
        ): Classifier for investing focus asset classes.
        _investing_focus_sector_classifier (InvestingFocusSectorClassifier): Classifier for investing focus sectors.
    """

    def __init__(self) -> None:
        """Initialize the VspDataEnrichment class with all required classifiers."""
        self._education_classifier = EducationClassifier()
        self._work_experience_classifier = WorkExperienceClassifier()
        self._investment_banking_classifier = InvestmentBankingGroupClassifier()
        self._investing_focus_asset_class_classifier = InvestingFocusAssetClassClassifier()
        self._investing_focus_sector_classifier = InvestingFocusSectorClassifier()

    @staticmethod
    def _position_interval(position, current_date: date) -> tuple[date, date] | None:
        """
        Normalize a position's start/end into concrete dates.

        Missing start month/day default to 1. A missing end month defaults to 12 and a
        missing end day to the last day of the end month. A position with no end at all
        is treated as ongoing and ends on ``current_date``.

        Args:
            position: The position whose ``start`` and ``end`` are read.
            current_date (date): End date to use for ongoing positions.

        Returns:
            tuple[date, date] | None: ``(start_date, end_date)``, or ``None`` when the
            interval is unusable (missing year, malformed date components, or a start
            after the end).
        """
        start = position.start
        if not start or not start.year:
            return None

        end = position.end
        try:
            start_date = date(start.year, start.month or 1, start.day or 1)
            if end is None:
                end_date = current_date
            else:
                if not end.year:
                    return None
                end_month = end.month or 12
                end_day = end.day or calendar.monthrange(end.year, end_month)[1]
                end_date = date(end.year, end_month, end_day)
        except ValueError:
            # Malformed components (e.g. month 13, day 31 in a 30-day month) are
            # treated like any other invalid interval instead of aborting the profile.
            return None

        if start_date > end_date:
            return None
        return start_date, end_date

    def estimate_full_time_experience_by_secondary_job_type(
        self,
        classified_work_experiences: Sequence[ClassifiedWorkExperience],
        current_date: date | None = None,
    ) -> Mapping[SecondaryJobType, float]:
        """
        Estimate the years of full-time experience per secondary job type.

        Only positions whose primary job type is ``PrimaryJobType.FULL_TIME`` and that
        have a secondary job type are counted. Overlapping positions of the same
        secondary job type are counted once (calendar time, not summed time); each
        distinct secondary job type active during a period is credited with that period.

        Args:
            classified_work_experiences (Sequence[ClassifiedWorkExperience]): The classified
                positions to aggregate.
            current_date (date | None): End date used for positions without an end.
                Defaults to today's date.

        Returns:
            Mapping[SecondaryJobType, float]: Years of experience (days / 365.25, rounded to
            2 decimals) for each secondary job type that accrued at least one day.
        """
        if current_date is None:
            current_date = date.today()

        # Sweep-line events: (date, +1 for a start / -1 for an end, secondary job type).
        events = []
        for cwe in classified_work_experiences:
            classification = cwe.work_experience_classification
            secondary_job_type = classification.secondary_job_type
            if classification.primary_job_type != PrimaryJobType.FULL_TIME or not secondary_job_type:
                continue

            interval = self._position_interval(cwe.position, current_date)
            if interval is None:
                continue

            start_date, end_date = interval
            events.append((start_date, 1, secondary_job_type))
            events.append((end_date, -1, secondary_job_type))

        # Stable sort by date only: each interval's start stays ahead of its own end,
        # so the open counts below never go negative.
        events.sort(key=lambda event: event[0])

        # Number of currently open positions per type. A plain set would drop a type
        # as soon as the first of several overlapping positions ends.
        open_counts = defaultdict(int)
        durations = defaultdict(int)  # in days
        last_date = None

        for event_date, delta, secondary_job_type in events:
            if last_date is not None and event_date > last_date:
                elapsed_days = (event_date - last_date).days
                for active_type, open_count in open_counts.items():
                    if open_count > 0:
                        durations[active_type] += elapsed_days

            open_counts[secondary_job_type] += delta
            last_date = event_date

        # Convert durations from days to years
        return {stype: round(days / 365.25, 2) for stype, days in durations.items()}

    async def _classify_position_details(
        self, profile: LinkedinProfile, position, work_classification
    ) -> ClassifiedWorkExperience:
        """
        Build the ClassifiedWorkExperience for one position, running follow-up classifiers.

        Internships and extracurriculars get no follow-up classification. Investment
        banking positions get a group classification. Investing positions (other than
        advisory-board/investor roles) get asset-class and sector classifications.

        Args:
            profile (LinkedinProfile): The profile the position belongs to.
            position: The position being classified.
            work_classification: The general work experience classification for ``position``.

        Returns:
            ClassifiedWorkExperience: The position with all applicable classifications attached.
        """
        classified = ClassifiedWorkExperience(
            position=position, work_experience_classification=work_classification
        )
        primary = work_classification.primary_job_type
        secondary = work_classification.secondary_job_type

        if primary in (PrimaryJobType.INTERNSHIP, PrimaryJobType.EXTRACURRICULAR):
            return classified

        # Compare against the enum classes rather than members reached through the
        # value itself, which raises AttributeError when the value is None.
        if secondary == SecondaryJobType.INVESTMENT_BANKING:
            classified.investment_banking_classification = (
                await self._investment_banking_classifier.classify_investment_banking_group(profile, position)
            )
        elif secondary == SecondaryJobType.INVESTING and primary != PrimaryJobType.ADVISORY_BOARD_INVESTOR:
            asset_class_result, sector_result = await asyncio.gather(
                self._investing_focus_asset_class_classifier.classify_investing_focus_asset_class(
                    profile, position
                ),
                self._investing_focus_sector_classifier.classify_investing_focus_sector(profile, position),
            )
            classified.investing_focus_asset_class_classification = asset_class_result
            classified.investing_focus_sector_classification = sector_result

        return classified

    async def process_linkedin_profile(self, profile: LinkedinProfile) -> LinkedinProfileClassificationResults:
        """
        Process a LinkedIn profile and classify its education and work experiences.

        This method maintains the original order of educations and work experiences
        from the input profile while performing asynchronous classification tasks.

        Args:
            profile (LinkedinProfile): The LinkedIn profile to process.

        Returns:
            LinkedinProfileClassificationResults: The comprehensive classification results for the profile.
        """
        educations = list(profile.educations)
        positions = list(profile.positions)

        # Keep the work as ordered sequences (not dicts keyed by the items): that neither
        # requires hashable models nor collapses equal entries, so results stay aligned
        # with the inputs when zipped below.
        education_results, work_experience_results = await asyncio.gather(
            asyncio.gather(
                *(self._education_classifier.classify_education(profile, education) for education in educations)
            ),
            asyncio.gather(
                *(
                    self._work_experience_classifier.classify_work_experience(profile, position)
                    for position in positions
                )
            ),
        )

        # Create ClassifiedEducation objects in the original order
        classified_educations = [
            ClassifiedEducation(education=education, classification=classification)
            for education, classification in zip(educations, education_results)
        ]

        # Run the follow-up classifiers for all positions concurrently; gather returns
        # results in argument order, so the original position order is preserved.
        classified_work_experiences = list(
            await asyncio.gather(
                *(
                    self._classify_position_details(profile, position, work_classification)
                    for position, work_classification in zip(positions, work_experience_results)
                )
            )
        )

        experience_by_job_type = self.estimate_full_time_experience_by_secondary_job_type(classified_work_experiences)
        # NOTE(review): periods in which several *different* secondary job types overlap are
        # counted once per type, so this total can exceed calendar time - confirm that is intended.
        # Rounded because summing already-rounded floats can leave noise such as 3.3000000000000003.
        total_work_experience = round(sum(experience_by_job_type.values()), 2)
        return LinkedinProfileClassificationResults(
            classified_educations=classified_educations,
            classified_work_experiences=classified_work_experiences,
            full_time_work_experience_years=total_work_experience,
            full_time_work_experience_by_secondary=experience_by_job_type,
        )


async def main() -> None:
    """
    Main function to demonstrate the usage of VspDataEnrichment.

    This function loads a sample LinkedIn profile from a JSON file,
    processes it using the VspDataEnrichment class, and prints the results
    as indented JSON.

    The sample path is relative to the current working directory.
    """
    import json

    # Load a sample LinkedIn profile. The encoding is given explicitly so that
    # reading the JSON does not depend on the platform's locale default.
    with open("tests/test_data/sample_profiles/eric_armagost.json", encoding="utf-8") as f:
        profile_data = json.load(f)

    # Validate after the file has been closed; only the parsed data is needed here.
    profile = LinkedinProfile.model_validate(profile_data)

    # Create an instance of VspDataEnrichment and process the profile
    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(profile)

    # Print the results
    print(results.model_dump_json(indent=2))


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())