File size: 13,604 Bytes
ad2d836 145d03b 819725d 145d03b 819725d 145d03b 85c3ddd 145d03b ad2d836 145d03b ad2d836 518f864 ad2d836 145d03b 85c3ddd 145d03b ad2d836 518f864 ad2d836 145d03b 819725d 145d03b ad2d836 145d03b ad2d836 145d03b ad2d836 145d03b ad2d836 518f864 ad2d836 145d03b 85c3ddd 518f864 ad2d836 819725d ad2d836 819725d 518f864 ad2d836 819725d ad2d836 819725d ad2d836 145d03b ad2d836 145d03b ad2d836 49b13c6 ad2d836 145d03b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 |
"""
main.py
This module serves as the main executable file entrypoint for the VSP Data Enrichment project.
It provides functionality to process LinkedIn profiles and classify various aspects of a person's
educational and professional background.
The main class, VspDataEnrichment, encapsulates all the necessary classifiers and methods
to perform a comprehensive analysis of a LinkedIn profile.
Usage:
from vsp.app.main import VspDataEnrichment
vsp_enrichment = VspDataEnrichment()
results = await vsp_enrichment.process_linkedin_profile(linkedin_profile)
"""
import asyncio
import calendar
from collections import defaultdict
from datetime import date
from typing import List, Mapping, Sequence
from pydantic import BaseModel, Field
from vsp.app.classifiers.education_classifier import EducationClassification, EducationClassifier
from vsp.app.classifiers.work_experience.general_work_experience_classifier import (
PrimaryJobType,
SecondaryJobType,
WorkExperienceClassification,
WorkExperienceClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_asset_class_classifier import (
InvestingFocusAssetClassClassification,
InvestingFocusAssetClassClassifier,
)
from vsp.app.classifiers.work_experience.investing_focus_sector_classifier import (
InvestingFocusSectorClassification,
InvestingFocusSectorClassifier,
)
from vsp.app.classifiers.work_experience.investment_banking_group_classifier import (
InvestmentBankingGroupClassification,
InvestmentBankingGroupClassifier,
)
from vsp.app.model.linkedin.linkedin_models import Education, LinkedinProfile, Position
class ClassifiedEducation(BaseModel):
    """
    Pairs a single LinkedIn education entry with the classifier output for it.

    Attributes:
        education (Education): The education entry exactly as it appears on the profile.
        classification (EducationClassification): The classification produced for that entry.
    """

    education: Education
    classification: EducationClassification
class ClassifiedWorkExperience(BaseModel):
    """
    Pairs a single LinkedIn position with every classification produced for it.

    Only ``position`` and ``work_experience_classification`` are required; the
    three specialised classifications default to ``None`` and are filled in only
    when the corresponding classifier was run for the position.

    Attributes:
        position (Position): The position exactly as it appears on the profile.
        work_experience_classification (WorkExperienceClassification): The general
            work experience classification.
        investment_banking_classification (InvestmentBankingGroupClassification | None):
            The investment banking group classification, if applicable.
        investing_focus_asset_class_classification (InvestingFocusAssetClassClassification | None):
            The investing focus asset class classification, if applicable.
        investing_focus_sector_classification (InvestingFocusSectorClassification | None):
            The investing focus sector classification, if applicable.
    """

    position: Position
    work_experience_classification: WorkExperienceClassification
    investment_banking_classification: InvestmentBankingGroupClassification | None = None
    investing_focus_asset_class_classification: InvestingFocusAssetClassClassification | None = None
    investing_focus_sector_classification: InvestingFocusSectorClassification | None = None
class LinkedinProfileClassificationResults(BaseModel):
    """
    Aggregated classification output for one LinkedIn profile.

    Every field has a default, so an empty result can be constructed with no arguments.

    Attributes:
        classified_educations (Sequence[ClassifiedEducation]): Classified education items;
            empty list by default.
        classified_work_experiences (Sequence[ClassifiedWorkExperience]): Classified work
            experience items; empty list by default.
        full_time_work_experience_years (float): Full-time work experience in years;
            0.0 by default.
        full_time_work_experience_by_secondary (Mapping[SecondaryJobType, float]): Full-time
            work experience in years keyed by secondary job type; empty dict by default.
    """

    classified_educations: Sequence[ClassifiedEducation] = Field(default_factory=list)
    classified_work_experiences: Sequence[ClassifiedWorkExperience] = Field(default_factory=list)
    full_time_work_experience_years: float = 0.0
    full_time_work_experience_by_secondary: Mapping[SecondaryJobType, float] = Field(default_factory=dict)
class VspDataEnrichment:
    """
    Main class for the VSP Data Enrichment project.

    This class encapsulates all the necessary classifiers and methods to process
    and enrich LinkedIn profile data with various classifications.

    Attributes:
        _education_classifier (EducationClassifier): Classifier for education items.
        _work_experience_classifier (WorkExperienceClassifier): Classifier for general work experiences.
        _investment_banking_classifier (InvestmentBankingGroupClassifier): Classifier for investment
            banking groups.
        _investing_focus_asset_class_classifier (InvestingFocusAssetClassClassifier): Classifier for
            investing focus asset classes.
        _investing_focus_sector_classifier (InvestingFocusSectorClassifier): Classifier for investing
            focus sectors.
    """

    def __init__(self) -> None:
        """Initialize the VspDataEnrichment class with all required classifiers."""
        self._education_classifier = EducationClassifier()
        self._work_experience_classifier = WorkExperienceClassifier()
        self._investment_banking_classifier = InvestmentBankingGroupClassifier()
        self._investing_focus_asset_class_classifier = InvestingFocusAssetClassClassifier()
        self._investing_focus_sector_classifier = InvestingFocusSectorClassifier()

    @staticmethod
    def _position_interval(position: Position, as_of: date) -> tuple[date, date] | None:
        """
        Turn a position's (possibly partial) start/end into a concrete date interval.

        A missing start month or day is taken as 1. A missing end month is taken as 12
        and a missing end day as the last day of the end month. A position whose ``end``
        is ``None`` is closed at ``as_of``.

        Returns:
            ``(start_date, end_date)``, or ``None`` when the interval cannot be used:
            no start year, an end without a year, components that do not form a real
            calendar date, or a start that falls after the end.
        """
        start = position.start
        if not start or not start.year:
            return None
        end = position.end
        try:
            start_date = date(start.year, start.month or 1, start.day or 1)
            if end is None:
                end_date = as_of
            else:
                if not end.year:
                    return None
                end_month = end.month or 12
                end_day = end.day or calendar.monthrange(end.year, end_month)[1]
                end_date = date(end.year, end_month, end_day)
        except ValueError:
            # Components that do not form a real date (e.g. day 31 in February) are
            # skipped like any other invalid interval instead of aborting the profile.
            return None
        if start_date > end_date:
            return None
        return start_date, end_date

    def estimate_full_time_experience_by_secondary_job_type(
        self,
        classified_work_experiences: List[ClassifiedWorkExperience],
        current_date: date | None = None,
    ) -> Mapping[SecondaryJobType, float]:
        """
        Estimate years of full-time experience per secondary job type.

        Only experiences whose primary job type is ``PrimaryJobType.FULL_TIME`` and that
        have a secondary job type are considered. Overlapping positions of the *same*
        secondary job type are counted once for the overlapping period; positions of
        *different* secondary job types that overlap each receive the full period.

        Args:
            classified_work_experiences: The classified work experiences to aggregate.
            current_date: The date used as the end of positions that have no end.
                Defaults to today's date.

        Returns:
            A mapping from secondary job type to years of experience (days / 365.25,
            rounded to two decimals). Types that accumulate no time are omitted.
        """
        as_of = date.today() if current_date is None else current_date

        # Sweep-line events: (date, +1 for a start / -1 for an end, secondary job type).
        events = []
        for cwe in classified_work_experiences:
            classification = cwe.work_experience_classification
            secondary_job_type = classification.secondary_job_type
            if classification.primary_job_type != PrimaryJobType.FULL_TIME or not secondary_job_type:
                continue
            interval = self._position_interval(cwe.position, as_of)
            if interval is None:
                continue
            start_date, end_date = interval
            events.append((start_date, 1, secondary_job_type))
            events.append((end_date, -1, secondary_job_type))

        # The sort is stable and each start is appended right before its end, so an
        # interval's start is always processed before its own end.
        events.sort(key=lambda event: event[0])

        # Number of currently open positions per type. A plain set would drop a type
        # as soon as the first of several overlapping positions of that type ended.
        active_counts = defaultdict(int)
        durations = defaultdict(int)  # in days
        last_date = None
        for event_date, delta, secondary_job_type in events:
            if last_date is not None and event_date > last_date:
                elapsed_days = (event_date - last_date).days
                for active_type, open_positions in active_counts.items():
                    if open_positions > 0:
                        durations[active_type] += elapsed_days
            active_counts[secondary_job_type] += delta
            last_date = event_date

        # Convert durations from days to years
        return {stype: round(days / 365.25, 2) for stype, days in durations.items()}

    async def _classify_work_experience_details(
        self,
        profile: LinkedinProfile,
        position: Position,
        work_classification: WorkExperienceClassification,
    ) -> ClassifiedWorkExperience:
        """Build the classified work experience for one position, running follow-up classifiers as needed."""
        classified_work_experience = ClassifiedWorkExperience(
            position=position, work_experience_classification=work_classification
        )
        primary_job_type = work_classification.primary_job_type
        secondary_job_type = work_classification.secondary_job_type

        # Internships and extracurriculars never get the specialised classifications.
        if primary_job_type in (PrimaryJobType.INTERNSHIP, PrimaryJobType.EXTRACURRICULAR):
            return classified_work_experience

        # Compare against the enum classes rather than reading members off the value:
        # the value may be None, which would raise AttributeError.
        if secondary_job_type == SecondaryJobType.INVESTMENT_BANKING:
            classified_work_experience.investment_banking_classification = (
                await self._investment_banking_classifier.classify_investment_banking_group(profile, position)
            )

        if (
            secondary_job_type == SecondaryJobType.INVESTING
            and primary_job_type != PrimaryJobType.ADVISORY_BOARD_INVESTOR
        ):
            asset_class_result, sector_result = await asyncio.gather(
                self._investing_focus_asset_class_classifier.classify_investing_focus_asset_class(
                    profile, position
                ),
                self._investing_focus_sector_classifier.classify_investing_focus_sector(profile, position),
            )
            classified_work_experience.investing_focus_asset_class_classification = asset_class_result
            classified_work_experience.investing_focus_sector_classification = sector_result

        return classified_work_experience

    async def process_linkedin_profile(self, profile: LinkedinProfile) -> LinkedinProfileClassificationResults:
        """
        Process a LinkedIn profile and classify its education and work experiences.

        This method maintains the original order of educations and work experiences
        from the input profile while performing asynchronous classification tasks.

        Args:
            profile (LinkedinProfile): The LinkedIn profile to process.

        Returns:
            LinkedinProfileClassificationResults: The comprehensive classification results for the profile.
            ``full_time_work_experience_years`` is the sum of the per-secondary-job-type
            values, so time during which positions of different secondary job types
            overlap is counted once per type.
        """
        # Ordered lists rather than dicts keyed by the items: equal entries would
        # collapse into one key and misalign the results with the profile order.
        educations = list(profile.educations)
        positions = list(profile.positions)

        # Run the education and general work experience classifications concurrently.
        education_results, work_experience_results = await asyncio.gather(
            asyncio.gather(
                *(self._education_classifier.classify_education(profile, education) for education in educations)
            ),
            asyncio.gather(
                *(
                    self._work_experience_classifier.classify_work_experience(profile, position)
                    for position in positions
                )
            ),
        )

        # Create ClassifiedEducation objects in the original order
        classified_educations = [
            ClassifiedEducation(education=education, classification=classification)
            for education, classification in zip(educations, education_results)
        ]

        # Follow-up classifiers run concurrently across positions; gather returns
        # results in argument order, so the profile order is preserved.
        classified_work_experiences = list(
            await asyncio.gather(
                *(
                    self._classify_work_experience_details(profile, position, work_classification)
                    for position, work_classification in zip(positions, work_experience_results)
                )
            )
        )

        experience_by_job_type = self.estimate_full_time_experience_by_secondary_job_type(
            classified_work_experiences
        )
        # Round so that adding two-decimal floats does not leak binary noise
        # (e.g. 3.3000000000000003) into the serialized result.
        total_work_experience = round(sum(experience_by_job_type.values()), 2)

        return LinkedinProfileClassificationResults(
            classified_educations=classified_educations,
            classified_work_experiences=classified_work_experiences,
            full_time_work_experience_years=total_work_experience,
            full_time_work_experience_by_secondary=experience_by_job_type,
        )
async def main(profile_path: str = "tests/test_data/sample_profiles/eric_armagost.json") -> None:
    """
    Main function to demonstrate the usage of VspDataEnrichment.

    This function loads a LinkedIn profile from a JSON file, processes it using
    the VspDataEnrichment class, and prints the results as indented JSON.

    Args:
        profile_path: Path to the JSON file holding the LinkedIn profile. Defaults to
            the bundled sample profile; a relative path is resolved against the
            current working directory.
    """
    import json

    # Load the LinkedIn profile. The encoding is explicit so the result does not
    # depend on the platform's default locale encoding.
    with open(profile_path, encoding="utf-8") as f:
        profile_data = json.load(f)
    profile = LinkedinProfile.model_validate(profile_data)

    # Create an instance of VspDataEnrichment and process the profile
    vsp_enrichment = VspDataEnrichment()
    results = await vsp_enrichment.process_linkedin_profile(profile)

    # Print the results
    print(results.model_dump_json(indent=2))
# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
|