"""
Template analyzer for project page generation.

Analyzes existing project page templates to understand structure and style.
"""

import json
import os
import re
from collections import Counter
from pathlib import Path

import yaml
from bs4 import BeautifulSoup
from bs4.element import Doctype
from jinja2 import Environment, StrictUndefined


class ProjectPageTemplateAnalyzer:
    """Analyzes project page templates to extract structure and styling patterns."""

    # Hex colours (3-6 hex digits) and rgb()/rgba() functional notation.
    _COLOR_PATTERN = re.compile(r'#[0-9a-fA-F]{3,6}|rgb\([^)]+\)|rgba\([^)]+\)')
    # Captures the value of a ``font-size`` declaration up to the next ``;``.
    _FONT_SIZE_PATTERN = re.compile(r'font-size:\s*([^;]+)')

    # Class-name patterns used to locate well-known page components.
    _HERO_CLASS = re.compile(r'hero|banner|intro')
    _BUTTON_CLASS = re.compile(r'btn|button')
    _CONTENT_BLOCK_CLASS = re.compile(r'content|block|section')
    _GALLERY_CLASS = re.compile(r'gallery|carousel|slider')

    _HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    # Ordered (content type, keywords) table: the first entry with a keyword
    # occurring as a substring of the element text wins.
    _CONTENT_KEYWORDS = (
        ('abstract', ('abstract', 'summary', 'overview')),
        ('introduction', ('introduction', 'background')),
        ('methodology', ('method', 'approach', 'methodology')),
        ('results', ('result', 'experiment', 'evaluation')),
        ('conclusion', ('conclusion', 'discussion', 'future')),
        ('contact', ('contact', 'author', 'team')),
    )

    def __init__(self, template_dir="project_templates"):
        """
        Create the analyzer and make sure the template directory exists.

        Args:
            template_dir: Directory for project templates. It is created
                (including missing parent directories) if it does not exist.
        """
        self.template_dir = Path(template_dir)
        # parents=True so a nested path such as "out/templates" does not fail.
        self.template_dir.mkdir(parents=True, exist_ok=True)
        # Both containers start empty; none of the methods in this class
        # writes to them.
        self.templates = {}
        self.common_patterns = {}

    def analyze_html_template(self, html_file_path):
        """
        Analyze an HTML template file to extract structure and styling.

        Args:
            html_file_path: Path to the HTML template file

        Returns:
            dict: Analysis results with the keys 'file_path', 'structure',
            'styling', 'sections', 'components' and 'meta_info', or None if
            the file could not be read or analyzed (the error is printed).
        """
        try:
            with open(html_file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            soup = BeautifulSoup(html_content, 'html.parser')

            return {
                # Stored as a string so the result stays JSON-serializable
                # even when the caller passes a pathlib.Path.
                'file_path': str(html_file_path),
                'structure': self._extract_structure(soup),
                'styling': self._extract_styling(soup),
                'sections': self._extract_sections(soup),
                'components': self._extract_components(soup),
                'meta_info': self._extract_meta_info(soup),
            }
        except Exception as e:
            # Deliberate best-effort: one broken template must not abort a
            # batch run, so the failure is reported and signalled with None.
            print(f"Error analyzing template {html_file_path}: {e}")
            return None

    def _extract_structure(self, soup):
        """
        Extract the overall structure of the HTML document.

        Returns:
            dict: 'doctype' (bool), 'html_lang', 'head_sections' and
            'body_sections' lists, plus 'main_content', 'navigation' and
            'footer', which this method leaves as None.
        """
        structure = {
            # A doctype is parsed as a Doctype string node, not as a tag, so
            # it has to be detected by node type rather than by tag name.
            'doctype': any(isinstance(node, Doctype) for node in soup.contents),
            'html_lang': soup.html.get('lang', 'en') if soup.html else 'en',
            'head_sections': [],
            'body_sections': [],
            'main_content': None,
            'navigation': None,
            'footer': None,
        }

        if soup.head:
            structure['head_sections'] = [
                {'tag': tag.name, 'attrs': dict(tag.attrs)}
                for tag in soup.head.find_all(['meta', 'link', 'script', 'title'])
            ]

        if soup.body:
            landmark_tags = ['header', 'nav', 'main', 'section', 'article', 'aside', 'footer']
            structure['body_sections'] = [
                {
                    'tag': section.name,
                    'id': section.get('id', ''),
                    'class': section.get('class', []),
                    'content_type': self._identify_content_type(section),
                }
                for section in soup.body.find_all(landmark_tags)
            ]

        return structure

    def _extract_styling(self, soup):
        """
        Extract CSS styling information.

        Only inline ``style`` attributes and ``<link rel="stylesheet">``
        references are inspected.

        Returns:
            dict: 'inline_styles', 'external_css', 'color_scheme',
            'typography' (tag name -> font-size) and 'layout' (left empty).
        """
        styling = {
            'inline_styles': [],
            'external_css': [],
            'color_scheme': [],
            'typography': {},
            'layout': {},
        }

        # Single pass over styled tags: record the raw style and its colours.
        for tag in soup.find_all(style=True):
            style_text = tag.get('style', '')
            styling['inline_styles'].append({'tag': tag.name, 'style': style_text})
            styling['color_scheme'].extend(self._COLOR_PATTERN.findall(style_text))

        styling['external_css'] = [
            link.get('href', '') for link in soup.find_all('link', rel='stylesheet')
        ]

        # A later tag of the same name overwrites an earlier one.
        for tag in soup.find_all(self._HEADING_TAGS + ['p']):
            font_size = self._FONT_SIZE_PATTERN.search(tag.get('style', ''))
            if font_size:
                styling['typography'][tag.name] = font_size.group(1).strip()

        return styling

    def _extract_sections(self, soup):
        """
        Extract content sections and their organization.

        Every ``section``, ``article`` or ``div`` that has a class attribute
        is reported.

        Returns:
            list: One dict per section with 'tag', 'id', 'classes',
            'content_type', 'content', 'images' and 'tables'.
        """
        return [
            {
                'tag': section.name,
                'id': section.get('id', ''),
                'classes': section.get('class', []),
                # Read by _find_common_patterns; without it every section
                # would be counted as 'unknown'.
                'content_type': self._identify_content_type(section),
                'content': self._extract_section_content(section),
                'images': self._extract_images(section),
                'tables': self._extract_tables(section),
            }
            for section in soup.find_all(['section', 'article', 'div'], class_=True)
        ]

    def _extract_components(self, soup):
        """Extract reusable components and their patterns."""
        return {
            'navigation': self._extract_navigation(soup),
            'hero_section': self._extract_hero_section(soup),
            'content_blocks': self._extract_content_blocks(soup),
            'image_galleries': self._extract_image_galleries(soup),
            'contact_forms': self._extract_contact_forms(soup),
        }

    def _extract_meta_info(self, soup):
        """
        Extract meta information and SEO elements.

        Returns:
            dict: 'title', 'meta_tags' (every ``<meta>``), 'open_graph'
            (properties starting with ``og:``) and 'twitter_cards' (names
            starting with ``twitter:``).
        """
        # ``.string`` is None for an empty <title> or one with nested markup;
        # fall back to '' so the title is always a plain string.
        title_text = soup.title.string if soup.title else None
        meta_info = {
            'title': str(title_text) if title_text is not None else '',
            'meta_tags': [],
            'open_graph': {},
            'twitter_cards': {},
        }

        for meta in soup.find_all('meta'):
            name = meta.get('name', '')
            prop = meta.get('property', '')
            content = meta.get('content', '')

            meta_info['meta_tags'].append({
                'name': name,
                'content': content,
                'property': prop,
            })

            if prop.startswith('og:'):
                meta_info['open_graph'][prop] = content

            if name.startswith('twitter:'):
                meta_info['twitter_cards'][name] = content

        return meta_info

    def _identify_content_type(self, element):
        """
        Identify the type of content in an element.

        The lower-cased text of the element is searched for keyword
        substrings; the first matching entry of ``_CONTENT_KEYWORDS`` decides.

        Returns:
            str: 'abstract', 'introduction', 'methodology', 'results',
            'conclusion', 'contact', or 'general' when nothing matches.
        """
        text = element.get_text().lower()
        for content_type, keywords in self._CONTENT_KEYWORDS:
            if any(word in text for word in keywords):
                return content_type
        return 'general'

    def _extract_section_content(self, element):
        """
        Extract text content from a section.

        Returns:
            dict: 'headings' (level + text), 'paragraphs', 'lists'
            (type + items) and 'code_blocks' (type + content).
        """
        return {
            'headings': [
                # The heading level is the digit in the tag name (h1..h6).
                {'level': int(heading.name[1]), 'text': heading.get_text().strip()}
                for heading in element.find_all(self._HEADING_TAGS)
            ],
            'paragraphs': [p.get_text().strip() for p in element.find_all('p')],
            'lists': [
                {
                    'type': listing.name,
                    'items': [li.get_text().strip() for li in listing.find_all('li')],
                }
                for listing in element.find_all(['ul', 'ol'])
            ],
            'code_blocks': [
                {'type': code.name, 'content': code.get_text().strip()}
                for code in element.find_all(['code', 'pre'])
            ],
        }

    def _extract_images(self, element):
        """Extract image information (src, alt, title, class) from an element."""
        return [
            {
                'src': img.get('src', ''),
                'alt': img.get('alt', ''),
                'title': img.get('title', ''),
                'class': img.get('class', []),
            }
            for img in element.find_all('img')
        ]

    def _extract_tables(self, element):
        """
        Extract table information from an element.

        Returns:
            list: One dict per table with 'class', 'headers' (text of every
            ``<th>``) and 'rows' (cell texts of each ``<tr>`` that contains
            at least one ``<td>``).
        """
        tables = []
        for table in element.find_all('table'):
            body_rows = (
                [td.get_text().strip() for td in tr.find_all('td')]
                for tr in table.find_all('tr')
            )
            tables.append({
                'class': table.get('class', []),
                'headers': [th.get_text().strip() for th in table.find_all('th')],
                # Header-only rows have no <td> cells and are dropped.
                'rows': [row for row in body_rows if row],
            })
        return tables

    def _extract_navigation(self, soup):
        """
        Extract navigation structure from the first ``<nav>`` element.

        Returns:
            dict: 'links', 'texts' and 'structure', or None if the document
            has no ``<nav>``.
        """
        nav = soup.find('nav')
        if nav is None:
            return None

        anchors = nav.find_all('a')
        return {
            'links': [a.get('href', '') for a in anchors],
            'texts': [a.get_text().strip() for a in anchors],
            'structure': self._extract_nav_structure(nav),
        }

    @staticmethod
    def _has_li_ancestor(item, root):
        """Return True if *item* sits inside an ``<li>`` below *root*."""
        for parent in item.parents:
            if parent is root:
                return False
            if parent.name == 'li':
                return True
        return False

    def _extract_nav_structure(self, nav_element):
        """
        Extract the hierarchical structure of navigation.

        Top-level entries are the ``<a>`` and ``<li>`` elements that are not
        themselves nested in an ``<li>``; wrapper elements such as ``<ul>``
        or ``<div>`` are looked through, so ``<nav><ul><li><a>`` markup is
        recognised as well as links placed directly inside ``<nav>``.

        Returns:
            list: ``{'type': 'link', 'text', 'href'}`` for plain links and
            ``{'type': 'group', 'items': [...]}`` for list items, where the
            items are all links found anywhere inside that ``<li>``.
        """
        structure = []
        for item in nav_element.find_all(['a', 'li']):
            # Anything inside an <li> belongs to that item's group.
            if self._has_li_ancestor(item, nav_element):
                continue

            if item.name == 'a':
                structure.append({
                    'type': 'link',
                    'text': item.get_text().strip(),
                    'href': item.get('href', ''),
                })
            else:
                structure.append({
                    'type': 'group',
                    'items': [
                        {
                            'text': link.get_text().strip(),
                            'href': link.get('href', ''),
                        }
                        for link in item.find_all('a')
                    ],
                })
        return structure

    def _extract_hero_section(self, soup):
        """
        Extract hero section information.

        The hero is the first ``<header>`` or ``<section>`` whose class
        matches ``hero``, ``banner`` or ``intro``.

        Returns:
            dict: 'title', 'subtitle', 'background_image' and 'cta_buttons',
            or None if no hero element is found.
        """
        hero = soup.find(['header', 'section'], class_=self._HERO_CLASS)
        if hero is None:
            return None

        title_tag = hero.find(['h1', 'h2'])
        # Skip the element already used as the title, otherwise an <h2>-only
        # hero would report the same text as title and subtitle.
        subtitle_tag = next(
            (tag for tag in hero.find_all(['h2', 'h3', 'p']) if tag is not title_tag),
            None,
        )
        image_tag = hero.find('img')

        return {
            'title': title_tag.get_text().strip() if title_tag is not None else '',
            'subtitle': subtitle_tag.get_text().strip() if subtitle_tag is not None else '',
            'background_image': image_tag.get('src', '') if image_tag is not None else '',
            'cta_buttons': [
                a.get_text().strip()
                for a in hero.find_all('a', class_=self._BUTTON_CLASS)
            ],
        }

    def _extract_content_blocks(self, soup):
        """
        Extract content block patterns.

        Considers ``<div>``/``<section>`` elements whose class matches
        ``content``, ``block`` or ``section``.
        """
        return [
            {
                'classes': block.get('class', []),
                'content_type': self._identify_content_type(block),
                'has_images': block.find('img') is not None,
                'has_tables': block.find('table') is not None,
                'has_code': block.find(['code', 'pre']) is not None,
            }
            for block in soup.find_all(['div', 'section'], class_=self._CONTENT_BLOCK_CLASS)
        ]

    def _extract_image_galleries(self, soup):
        """
        Extract image gallery patterns.

        Considers ``<div>``/``<section>`` elements whose class matches
        ``gallery``, ``carousel`` or ``slider``. The layout is reported as
        'grid' when any class contains "grid", otherwise as 'carousel'.
        """
        galleries = []
        for gallery in soup.find_all(['div', 'section'], class_=self._GALLERY_CLASS):
            images = gallery.find_all('img')
            galleries.append({
                'image_count': len(images),
                'layout': 'grid' if 'grid' in str(gallery.get('class', [])) else 'carousel',
                'images': [img.get('src', '') for img in images],
            })
        return galleries

    def _extract_contact_forms(self, soup):
        """
        Extract contact form patterns.

        Returns:
            list: One dict per ``<form>`` with 'action', 'method' (default
            'get') and 'fields' describing each input/textarea/select.
        """
        forms = []
        for form in soup.find_all('form'):
            forms.append({
                'action': form.get('action', ''),
                'method': form.get('method', 'get'),
                'fields': [
                    {
                        # Elements without a type attribute (textarea,
                        # select) report their tag name instead.
                        'type': field.get('type', field.name),
                        'name': field.get('name', ''),
                        'placeholder': field.get('placeholder', ''),
                        'required': field.get('required') is not None,
                    }
                    for field in form.find_all(['input', 'textarea', 'select'])
                ],
            })
        return forms

    def analyze_multiple_templates(self, template_files):
        """
        Analyze multiple template files and find common patterns.

        Args:
            template_files: List of template file paths

        Returns:
            dict: 'individual_analyses' (templates that failed to analyze are
            skipped) and 'common_patterns'.
        """
        all_analyses = []
        for template_file in template_files:
            analysis = self.analyze_html_template(template_file)
            if analysis is not None:
                all_analyses.append(analysis)

        return {
            'individual_analyses': all_analyses,
            'common_patterns': self._find_common_patterns(all_analyses),
        }

    def _find_common_patterns(self, analyses):
        """
        Find common patterns across multiple template analyses.

        A section content type is common when it appears in more than 50% of
        the templates; a colour is common when it appears in more than 30%.
        Each value is counted at most once per template, so repetition inside
        a single template does not make it look widespread.

        Args:
            analyses: List of analysis dicts; each needs a 'sections' list
                and a 'styling' dict containing 'color_scheme'.

        Returns:
            dict: 'common_sections' and 'color_schemes' in first-seen order,
            plus 'common_styles', 'common_components' and 'layout_patterns',
            which are left empty.
        """
        template_count = len(analyses)
        section_hits = Counter()
        color_hits = Counter()

        for analysis in analyses:
            # dict.fromkeys de-duplicates while keeping first-seen order.
            section_hits.update(list(dict.fromkeys(
                section.get('content_type', 'unknown')
                for section in analysis['sections']
            )))
            color_hits.update(list(dict.fromkeys(analysis['styling']['color_scheme'])))

        return {
            'common_sections': [
                section_type for section_type, count in section_hits.items()
                if count > template_count * 0.5
            ],
            'common_styles': [],
            'common_components': [],
            'color_schemes': [
                color for color, count in color_hits.items()
                if count > template_count * 0.3
            ],
            'layout_patterns': [],
        }

    def save_analysis(self, analysis, output_path):
        """
        Save analysis results to a JSON file.

        Args:
            analysis: JSON-serializable analysis data.
            output_path: Destination file path.

        Returns:
            bool: True on success, False if writing or serialization failed
            (the error is printed).
        """
        try:
            # Explicit encoding so the output does not depend on the locale.
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(analysis, f, indent=2)
        except Exception as e:
            print(f"Error saving analysis: {e}")
            return False

        print(f"Analysis saved to {output_path}")
        return True