File size: 13,604 Bytes
ad2d836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145d03b
 
819725d
 
 
 
145d03b
 
 
 
 
819725d
 
145d03b
 
 
85c3ddd
 
 
 
 
 
 
 
145d03b
 
 
 
 
 
 
 
ad2d836
 
 
 
 
 
 
145d03b
 
 
 
 
 
ad2d836
 
 
 
 
518f864
 
 
 
 
 
 
 
 
 
 
ad2d836
145d03b
 
 
 
85c3ddd
 
145d03b
 
 
ad2d836
 
 
 
 
518f864
 
ad2d836
145d03b
 
 
819725d
 
145d03b
 
ad2d836
145d03b
ad2d836
145d03b
ad2d836
 
145d03b
ad2d836
 
 
 
518f864
 
 
ad2d836
145d03b
85c3ddd
518f864
ad2d836
 
 
 
 
 
 
819725d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad2d836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
819725d
 
 
 
 
518f864
 
 
 
ad2d836
 
 
 
 
 
 
 
 
 
 
 
819725d
 
ad2d836
819725d
 
 
 
ad2d836
145d03b
 
 
 
ad2d836
 
 
 
145d03b
ad2d836
 
 
49b13c6
ad2d836
 
 
 
 
 
 
 
145d03b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""
main.py

This module is the main executable entry point for the VSP Data Enrichment project.
It provides functionality to process LinkedIn profiles and classify various aspects of a person's
educational and professional background.

The main class, VspDataEnrichment, encapsulates all the necessary classifiers and methods
to perform a comprehensive analysis of a LinkedIn profile.

Usage:
    from vsp.app.main import VspDataEnrichment

    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(linkedin_profile)

"""

import asyncio
import calendar
from collections import defaultdict
from datetime import date
from typing import List, Mapping, Sequence

from pydantic import BaseModel, Field

from vsp.app.classifiers.education_classifier import EducationClassification, EducationClassifier
from vsp.app.classifiers.work_experience.general_work_experience_classifier import (
    PrimaryJobType,
    SecondaryJobType,
    WorkExperienceClassification,
    WorkExperienceClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_asset_class_classifier import (
    InvestingFocusAssetClassClassification,
    InvestingFocusAssetClassClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_sector_classifier import (
    InvestingFocusSectorClassification,
    InvestingFocusSectorClassifier,
)
from vsp.app.classifiers.work_experience.investment_banking_group_classifier import (
    InvestmentBankingGroupClassification,
    InvestmentBankingGroupClassifier,
)
from vsp.app.model.linkedin.linkedin_models import Education, LinkedinProfile, Position


class ClassifiedEducation(BaseModel):
    """
    Pairs one education entry from a LinkedIn profile with its classification.

    Both fields are required; the model declares no defaults.

    Attributes:
        education (Education): The original education item from the LinkedIn profile.
        classification (EducationClassification): The classification results for the education item.
    """

    education: Education
    classification: EducationClassification


class ClassifiedWorkExperience(BaseModel):
    """
    Pairs one position from a LinkedIn profile with its classification results.

    ``position`` and ``work_experience_classification`` are required. The three
    specialised classifications are optional and default to ``None``.

    Attributes:
        position (Position): The original position item from the LinkedIn profile.
        work_experience_classification (WorkExperienceClassification): The general
            classification results for the work experience.
        investment_banking_classification (InvestmentBankingGroupClassification | None):
            The investment banking classification results, if applicable.
        investing_focus_asset_class_classification (InvestingFocusAssetClassClassification | None):
            The investing focus asset class classification results, if applicable.
        investing_focus_sector_classification (InvestingFocusSectorClassification | None):
            The investing focus sector classification results, if applicable.
    """

    position: Position
    work_experience_classification: WorkExperienceClassification
    # The fields below stay None unless a caller assigns a result to them.
    investment_banking_classification: InvestmentBankingGroupClassification | None = None
    investing_focus_asset_class_classification: InvestingFocusAssetClassClassification | None = None
    investing_focus_sector_classification: InvestingFocusSectorClassification | None = None


class LinkedinProfileClassificationResults(BaseModel):
    """
    Represents the complete classification results for a LinkedIn profile.

    Every field has a default, so an empty result can be constructed without arguments.

    Attributes:
        classified_educations (Sequence[ClassifiedEducation]): A sequence of classified education items.
            Defaults to an empty list.
        classified_work_experiences (Sequence[ClassifiedWorkExperience]): A sequence of classified work
            experience items. Defaults to an empty list.
        full_time_work_experience_years (float): Total full-time work experience in years.
            Defaults to 0.0.
        full_time_work_experience_by_secondary (Mapping[SecondaryJobType, float]): Full-time work
            experience in years, keyed by secondary job type. Defaults to an empty dict.
    """

    classified_educations: Sequence[ClassifiedEducation] = Field(default_factory=list)
    classified_work_experiences: Sequence[ClassifiedWorkExperience] = Field(default_factory=list)
    full_time_work_experience_years: float = Field(default=0.0)
    full_time_work_experience_by_secondary: Mapping[SecondaryJobType, float] = Field(default_factory=dict)


class VspDataEnrichment:
    """
    Main class for the VSP Data Enrichment project.

    This class encapsulates all the necessary classifiers and methods to process
    and enrich LinkedIn profile data with various classifications.

    Attributes:
        _education_classifier (EducationClassifier): Classifier for education items.
        _work_experience_classifier (WorkExperienceClassifier): Classifier for general work experiences.
        _investment_banking_classifier (InvestmentBankingGroupClassifier): Classifier for investment
            banking groups.
        _investing_focus_asset_class_classifier (InvestingFocusAssetClassClassifier): Classifier for
            investing focus asset classes.
        _investing_focus_sector_classifier (InvestingFocusSectorClassifier): Classifier for investing
            focus sectors.
    """

    def __init__(self) -> None:
        """Initialize the VspDataEnrichment class with all required classifiers."""
        self._education_classifier = EducationClassifier()
        self._work_experience_classifier = WorkExperienceClassifier()
        self._investment_banking_classifier = InvestmentBankingGroupClassifier()
        self._investing_focus_asset_class_classifier = InvestingFocusAssetClassClassifier()
        self._investing_focus_sector_classifier = InvestingFocusSectorClassifier()

    @staticmethod
    def _position_interval(position: Position, as_of: date) -> tuple[date, date] | None:
        """
        Normalize a position's start/end into concrete dates.

        Missing start month/day default to 1. A missing end month defaults to
        December and a missing end day to the last day of the end month. A
        position without an end is treated as ongoing until ``as_of``, and an
        end date later than ``as_of`` is clamped to ``as_of``.

        Args:
            position (Position): The position whose ``start`` and ``end`` are read.
            as_of (date): The reference date used for ongoing positions.

        Returns:
            tuple[date, date] | None: ``(start_date, end_date)``, or ``None`` when the
            position has no usable start or end year, contains an impossible
            calendar date, or starts after it ends.
        """
        start = position.start
        if not start or not start.year:
            return None

        end = position.end
        try:
            start_date = date(start.year, start.month or 1, start.day or 1)
            if end is None:
                end_date = as_of
            else:
                if not end.year:
                    return None
                end_month = end.month or 12
                # monthrange returns (weekday of first day, number of days in month).
                end_day = end.day or calendar.monthrange(end.year, end_month)[1]
                end_date = date(end.year, end_month, end_day)
        except ValueError:
            # An impossible date (e.g. 30 February, month 13) is skipped like any
            # other unusable interval rather than aborting the whole profile.
            return None

        # Experience "as of" a date cannot include days after that date.
        end_date = min(end_date, as_of)
        if start_date > end_date:
            return None  # Skip invalid intervals
        return start_date, end_date

    @staticmethod
    def _days_active_by_job_type(
        intervals: Sequence[tuple[date, date, SecondaryJobType]],
    ) -> dict[SecondaryJobType, int]:
        """
        Count, per job type, the days covered by at least one interval of that type.

        Overlapping intervals of the same type are counted once. Intervals of
        different types are counted independently of each other.

        Args:
            intervals: ``(start_date, end_date, job_type)`` triples with
                ``start_date <= end_date``.

        Returns:
            dict[SecondaryJobType, int]: Days per job type. Types whose intervals
            all have zero length are omitted.
        """
        events = []
        for start_date, end_date, job_type in intervals:
            events.append((start_date, 1, job_type))
            events.append((end_date, -1, job_type))

        # Sort on the date only (job types need not be orderable). The sort is
        # stable, so each interval's start still precedes its own end and the
        # open counts below never go negative.
        events.sort(key=lambda event: event[0])

        # Number of currently open intervals per job type. A plain set would drop
        # a type at the first "end" even if another interval of it is still open.
        open_counts = defaultdict(int)
        days_by_type = defaultdict(int)
        last_date = None

        for event_date, delta, job_type in events:
            if last_date is not None and event_date > last_date:
                elapsed_days = (event_date - last_date).days
                for active_type, open_count in open_counts.items():
                    if open_count > 0:
                        days_by_type[active_type] += elapsed_days

            open_counts[job_type] += delta
            last_date = event_date

        return dict(days_by_type)

    def estimate_full_time_experience_by_secondary_job_type(
        self,
        classified_work_experiences: List[ClassifiedWorkExperience],
        as_of: date | None = None,
    ) -> Mapping[SecondaryJobType, float]:
        """
        Estimate years of full-time experience for each secondary job type.

        Only work experiences whose primary job type is ``PrimaryJobType.FULL_TIME``
        and that have a truthy secondary job type are considered. Time covered by
        several positions of the same secondary job type is counted once.

        Args:
            classified_work_experiences (List[ClassifiedWorkExperience]): The classified
                work experiences to aggregate.
            as_of (date | None): The date up to which ongoing positions are counted.
                Defaults to today's date.

        Returns:
            Mapping[SecondaryJobType, float]: Years of experience per secondary job type,
            computed as days / 365.25 and rounded to two decimals.
        """
        reference_date = as_of if as_of is not None else date.today()

        intervals = []
        for cwe in classified_work_experiences:
            classification = cwe.work_experience_classification
            secondary_job_type = classification.secondary_job_type
            if classification.primary_job_type != PrimaryJobType.FULL_TIME or not secondary_job_type:
                continue

            interval = self._position_interval(cwe.position, reference_date)
            if interval is None:
                continue
            start_date, end_date = interval
            intervals.append((start_date, end_date, secondary_job_type))

        days_by_type = self._days_active_by_job_type(intervals)

        # Convert durations from days to years
        return {job_type: round(days / 365.25, 2) for job_type, days in days_by_type.items()}

    async def _classify_work_experience_details(
        self,
        profile: LinkedinProfile,
        position: Position,
        work_classification: WorkExperienceClassification,
    ) -> ClassifiedWorkExperience:
        """
        Build the ClassifiedWorkExperience for one position, running follow-up classifiers.

        Internships and extracurriculars receive no follow-up classification.
        Investment banking positions get an investment banking group classification.
        Investing positions, except advisory-board investors, get asset class and
        sector classifications, which are requested concurrently.

        Args:
            profile (LinkedinProfile): The profile the position belongs to.
            position (Position): The position being classified.
            work_classification (WorkExperienceClassification): The general classification
                already obtained for the position.

        Returns:
            ClassifiedWorkExperience: The position with all applicable classifications attached.
        """
        classified_work_experience = ClassifiedWorkExperience(
            position=position, work_experience_classification=work_classification
        )

        # Compare against the enum classes rather than reaching members through the
        # classified value, which fails with AttributeError if that value is None.
        primary_job_type = work_classification.primary_job_type
        secondary_job_type = work_classification.secondary_job_type

        if primary_job_type in (PrimaryJobType.INTERNSHIP, PrimaryJobType.EXTRACURRICULAR):
            return classified_work_experience

        if secondary_job_type == SecondaryJobType.INVESTMENT_BANKING:
            classified_work_experience.investment_banking_classification = (
                await self._investment_banking_classifier.classify_investment_banking_group(profile, position)
            )

        if (
            secondary_job_type == SecondaryJobType.INVESTING
            and primary_job_type != PrimaryJobType.ADVISORY_BOARD_INVESTOR
        ):
            asset_class_result, sector_result = await asyncio.gather(
                self._investing_focus_asset_class_classifier.classify_investing_focus_asset_class(
                    profile, position
                ),
                self._investing_focus_sector_classifier.classify_investing_focus_sector(profile, position),
            )
            classified_work_experience.investing_focus_asset_class_classification = asset_class_result
            classified_work_experience.investing_focus_sector_classification = sector_result

        return classified_work_experience

    async def process_linkedin_profile(self, profile: LinkedinProfile) -> LinkedinProfileClassificationResults:
        """
        Process a LinkedIn profile and classify its education and work experiences.

        This method maintains the original order of educations and work experiences
        from the input profile while performing asynchronous classification tasks.
        Education and work experience classifications run concurrently, and the
        follow-up classifications of the individual positions run concurrently as well.

        Args:
            profile (LinkedinProfile): The LinkedIn profile to process.

        Returns:
            LinkedinProfileClassificationResults: The comprehensive classification results for the profile.
        """
        # Keep the coroutines in lists, in profile order. Keying a dict by the model
        # objects would require them to be hashable and would collapse equal entries,
        # misaligning the results with profile.educations / profile.positions.
        education_tasks = [
            self._education_classifier.classify_education(profile, education) for education in profile.educations
        ]
        work_experience_tasks = [
            self._work_experience_classifier.classify_work_experience(profile, position)
            for position in profile.positions
        ]

        # Wait for all education and work experience classifications to complete
        education_results, work_experience_results = await asyncio.gather(
            asyncio.gather(*education_tasks),
            asyncio.gather(*work_experience_tasks),
        )

        # Create ClassifiedEducation objects in the original order
        classified_educations = [
            ClassifiedEducation(education=education, classification=classification)
            for education, classification in zip(profile.educations, education_results)
        ]

        # asyncio.gather returns results in argument order, so the original
        # position order is preserved.
        classified_work_experiences = list(
            await asyncio.gather(
                *(
                    self._classify_work_experience_details(profile, position, work_classification)
                    for position, work_classification in zip(profile.positions, work_experience_results)
                )
            )
        )

        experience_by_job_type = self.estimate_full_time_experience_by_secondary_job_type(classified_work_experiences)
        # NOTE(review): positions of different secondary job types that overlap in time
        # are counted once per type, so this sum can exceed elapsed calendar time.
        # Rounded because summing two-decimal floats can introduce binary noise.
        total_work_experience = round(sum(experience_by_job_type.values()), 2)
        return LinkedinProfileClassificationResults(
            classified_educations=classified_educations,
            classified_work_experiences=classified_work_experiences,
            full_time_work_experience_years=total_work_experience,
            full_time_work_experience_by_secondary=experience_by_job_type,
        )


async def main() -> None:
    """
    Main function to demonstrate the usage of VspDataEnrichment.

    This function loads a sample LinkedIn profile from a JSON file,
    processes it using the VspDataEnrichment class, and prints the results
    as indented JSON.
    """
    import json

    # Load a sample LinkedIn profile. The encoding is given explicitly so the
    # read does not depend on the platform's default text encoding.
    with open("tests/test_data/sample_profiles/eric_armagost.json", encoding="utf-8") as f:
        profile_data = json.load(f)
    profile = LinkedinProfile.model_validate(profile_data)

    # Create an instance of VspDataEnrichment and process the profile
    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(profile)

    # Print the results
    print(results.model_dump_json(indent=2))


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())