finalize data_models.py

This commit is contained in:
Sina Atalay 2024-02-02 19:31:07 +01:00
parent 605697c089
commit cbac4b9432
1 changed files with 246 additions and 211 deletions

View File

@ -1,8 +1,9 @@
""" """
This module contains all the necessary classes to store CV data. The YAML input file is This module contains all the necessary classes to store CV data. These classes are called
transformed into instances of these classes (i.e., the input file is read) with the data models. The YAML input file is transformed into instances of these classes (i.e.,
[`read_input_file`](utilities.md#read_input_file) function. RenderCV utilizes these the input file is read) with the [`read_input_file`](#read_input_file) function.
instances to generate a CV. These classes are called data models. RenderCV utilizes these instances to generate a LaTeX file which is then rendered into a
PDF file.
The data models are initialized with data validation to prevent unexpected bugs. During The data models are initialized with data validation to prevent unexpected bugs. During
the initialization, we ensure that everything is in the correct place and that the user the initialization, we ensure that everything is in the correct place and that the user
@ -20,20 +21,21 @@ import json
import re import re
import ssl import ssl
import time import time
import pathlib
import pydantic import pydantic
import pydantic_extra_types.phone_numbers as pydantic_phone_numbers import pydantic_extra_types.phone_numbers as pydantic_phone_numbers
import pydantic.functional_validators as pydantic_functional_validators
import strictyaml import strictyaml
from . import utilities from . import utilities
from .terminal_reporter import warning from .terminal_reporter import warning
from .themes.classic import ClassicThemeOptions from .themes.classic import ClassicThemeOptions
from .terminal_reporter import warning, error, information from .terminal_reporter import information, time_the_event_below
# Create a custom type called PastDate that accepts a string in YYYY-MM-DD format and # Create a custom type called RenderCVDate that accepts only strings in YYYY-MM-DD or
# returns a Date object. It also checks if the date is in the past. # YYYY-MM format:
# This type is used to validate the date fields in the data.
# See https://docs.pydantic.dev/2.5/concepts/types/#custom-types for more information # See https://docs.pydantic.dev/2.5/concepts/types/#custom-types for more information
# about custom types. # about custom types.
RenderCVDate = Annotated[ RenderCVDate = Annotated[
@ -44,7 +46,8 @@ RenderCVDate = Annotated[
def get_date_object(date: str | int) -> Date: def get_date_object(date: str | int) -> Date:
"""Parse a date string in YYYY-MM-DD, YYYY-MM, or YYYY format and return a """Parse a date string in YYYY-MM-DD, YYYY-MM, or YYYY format and return a
datetime.date object. datetime.date object. This function is used throughout the validation process of the
data models.
Args: Args:
date_string (str): The date string to parse. date_string (str): The date string to parse.
@ -71,10 +74,9 @@ def get_date_object(date: str | int) -> Date:
class RenderCVBaseModel(pydantic.BaseModel): class RenderCVBaseModel(pydantic.BaseModel):
""" """This class is the parent class of all the data models in RenderCV. It has only
This class is the parent class of all the data models in RenderCV. It has only one one difference from the default `pydantic.BaseModel`: It raises an error if an
difference from the default `pydantic.BaseModel`: It raises an error if an unknown unknown key is provided in the input file.
key is provided in the input file.
""" """
model_config = pydantic.ConfigDict(extra="forbid") model_config = pydantic.ConfigDict(extra="forbid")
@ -136,8 +138,7 @@ class EntryBase(RenderCVBaseModel):
@classmethod @classmethod
def check_dates(cls, model): def check_dates(cls, model):
""" """
Check if the dates are provided correctly and convert them to `Date` objects if Check if the dates are provided correctly and do the necessary adjustments.
they are provided in YYYY-MM-DD format.
""" """
date_is_provided = False date_is_provided = False
start_date_is_provided = False start_date_is_provided = False
@ -215,8 +216,7 @@ class EntryBase(RenderCVBaseModel):
Example: Example:
```python ```python
entry = dm.EntryBase(start_date=2020-10-11, end_date=2021-04-04) entry = dm.EntryBase(start_date=2020-10-11, end_date=2021-04-04).date_string
entry.date_string
``` ```
will return: will return:
`#!python "2020-10-11 to 2021-04-04"` `#!python "2020-10-11 to 2021-04-04"`
@ -226,26 +226,33 @@ class EntryBase(RenderCVBaseModel):
date_object = get_date_object(self.date) date_object = get_date_object(self.date)
date_string = utilities.format_date(date_object) date_string = utilities.format_date(date_object)
except ValueError: except ValueError:
# Then it is a custom date string (e.g., "My Custom Date")
date_string = str(self.date) date_string = str(self.date)
elif self.start_date is not None and self.end_date is not None: elif self.start_date is not None and self.end_date is not None:
if isinstance(self.start_date, int): if isinstance(self.start_date, int):
# Then it means only the year is provided
start_date = str(self.start_date) start_date = str(self.start_date)
else: else:
# Then it means start_date is either in YYYY-MM-DD or YYYY-MM format
date_object = get_date_object(self.start_date) date_object = get_date_object(self.start_date)
start_date = utilities.format_date(date_object) start_date = utilities.format_date(date_object)
if self.end_date == "present": if self.end_date == "present":
end_date = "present" end_date = "present"
elif isinstance(self.end_date, int): elif isinstance(self.end_date, int):
# Then it means only the year is provided
end_date = str(self.end_date) end_date = str(self.end_date)
else: else:
# Then it means end_date is either in YYYY-MM-DD or YYYY-MM format
date_object = get_date_object(self.end_date) date_object = get_date_object(self.end_date)
end_date = utilities.format_date(date_object) end_date = utilities.format_date(date_object)
date_string = f"{start_date} to {end_date}" date_string = f"{start_date} to {end_date}"
else: else:
# Neither date, start_date, nor end_date is provided, so return an empty
# string:
date_string = "" date_string = ""
return date_string return date_string
@ -259,8 +266,7 @@ class EntryBase(RenderCVBaseModel):
Example: Example:
```python ```python
entry = dm.EntryBase(start_date=2020-01-01, end_date=2020-04-20) entry = dm.EntryBase(start_date=2020-01-01, end_date=2020-04-20).time_span
entry.time_span
``` ```
will return: will return:
`#!python "4 months"` `#!python "4 months"`
@ -270,6 +276,8 @@ class EntryBase(RenderCVBaseModel):
date = self.date date = self.date
if date is not None or (start_date is None and end_date is None): if date is not None or (start_date is None and end_date is None):
# If only the date is provided, the time span is irrelevant. So, return an
# empty string.
return "" return ""
elif isinstance(start_date, int) or isinstance(end_date, int): elif isinstance(start_date, int) or isinstance(end_date, int):
@ -288,6 +296,8 @@ class EntryBase(RenderCVBaseModel):
return time_span_string return time_span_string
else: else:
# Then it means both start_date and end_date are in YYYY-MM-DD or YYYY-MM
# format.
end_date = get_date_object(end_date) # type: ignore end_date = get_date_object(end_date) # type: ignore
start_date = get_date_object(start_date) # type: ignore start_date = get_date_object(start_date) # type: ignore
@ -326,6 +336,7 @@ class EntryBase(RenderCVBaseModel):
""" """
url_text = None url_text = None
if self.url_text_input is not None: if self.url_text_input is not None:
# If the user provides a custom URL text, then use it.
url_text = self.url_text_input url_text = self.url_text_input
elif self.url is not None: elif self.url is not None:
url_text_dictionary = { url_text_dictionary = {
@ -399,7 +410,7 @@ class EducationEntry(EntryBase):
class PublicationEntry(RenderCVBaseModel): class PublicationEntry(RenderCVBaseModel):
"""THis class is the data model of `PublicationEntry`.""" """This class is the data model of `PublicationEntry`."""
title: str = pydantic.Field( title: str = pydantic.Field(
title="Title of the Publication", title="Title of the Publication",
@ -442,7 +453,8 @@ class PublicationEntry(RenderCVBaseModel):
@classmethod @classmethod
def check_doi(cls, doi: str) -> str: def check_doi(cls, doi: str) -> str:
"""Check if the DOI exists in the DOI System.""" """Check if the DOI exists in the DOI System."""
# see https://stackoverflow.com/a/60671292/18840665 # see https://stackoverflow.com/a/60671292/18840665 for the explanation of the
# next line:
ssl._create_default_https_context = ssl._create_unverified_context ssl._create_default_https_context = ssl._create_unverified_context
doi_url = f"http://doi.org/{doi}" doi_url = f"http://doi.org/{doi}"
@ -458,11 +470,13 @@ class PublicationEntry(RenderCVBaseModel):
@pydantic.computed_field @pydantic.computed_field
@cached_property @cached_property
def doi_url(self) -> str: def doi_url(self) -> str:
"""Return the URL of the DOI."""
return f"https://doi.org/{self.doi}" return f"https://doi.org/{self.doi}"
@pydantic.computed_field @pydantic.computed_field
@cached_property @cached_property
def date_string(self) -> str: def date_string(self) -> str:
"""Return the date string of the publication."""
if isinstance(self.date, int): if isinstance(self.date, int):
date_string = str(self.date) date_string = str(self.date)
elif isinstance(self.date, str): elif isinstance(self.date, str):
@ -477,7 +491,9 @@ class PublicationEntry(RenderCVBaseModel):
# ====================================================================================== # ======================================================================================
# Section models: ====================================================================== # Section models: ======================================================================
# ====================================================================================== # ======================================================================================
# Each section data model has a field called `entry_type` and a field called `entries`.
# Since the same pydantic.Field object is used in all of the section models, it is
# defined as a separate variable and used in all of the section models:
entry_type_field_of_section_model = pydantic.Field( entry_type_field_of_section_model = pydantic.Field(
title="Entry Type", title="Entry Type",
description="The type of the entries in the section.", description="The type of the entries in the section.",
@ -493,7 +509,7 @@ class SectionBase(RenderCVBaseModel):
because all of the section types have a common field called `title`. because all of the section types have a common field called `title`.
""" """
# title is excluded from the JSON schema because this will be written by RenderCV # Title is excluded from the JSON schema because this will be written by RenderCV
# depending on the key in the input file. # depending on the key in the input file.
title: Optional[str] = pydantic.Field(default=None, exclude=True) title: Optional[str] = pydantic.Field(default=None, exclude=True)
@ -540,8 +556,9 @@ class SectionWithTextEntries(SectionBase):
entries: list[str] = entries_field_of_section_model entries: list[str] = entries_field_of_section_model
# A custom type Section. It is a union of all the section types and the correct section # Create a custom type called Section:
# type is determined by the entry_type field. # It is a union of all the section types and the correct section type is determined by
# the entry_type field, thanks to Pydantic's discriminator feature.
# See https://docs.pydantic.dev/2.5/concepts/fields/#discriminator for more information # See https://docs.pydantic.dev/2.5/concepts/fields/#discriminator for more information
# about discriminators. # about discriminators.
Section = Annotated[ Section = Annotated[
@ -562,9 +579,13 @@ Section = Annotated[
# RenderCV requires users to specify the entry type for each section in their CV in # RenderCV requires users to specify the entry type for each section in their CV in
# order to render the correct thing in the CV. However, for certain sections, specifying # order to render the correct thing in the CV. However, for certain sections, specifying
# the entry type can be redundant. To simplify this process for users, default entry # the entry type can be redundant (for example, for the "Education" section, the entry
# type is probably "EducationEntry"). To simplify this process for users, default entry
# types are stored in a dictionary for certain section titles so that users do not have # types are stored in a dictionary for certain section titles so that users do not have
# to specify them. # to specify them.
# If you have new section titles that you would like to add to this dictionary, please
# open an issue or pull request on GitHub.
default_entry_types_for_a_given_title: dict[ default_entry_types_for_a_given_title: dict[
str, str,
tuple[type[EducationEntry], type[SectionWithEducationEntries]] tuple[type[EducationEntry], type[SectionWithEducationEntries]]
@ -613,6 +634,7 @@ class SocialNetwork(RenderCVBaseModel):
@pydantic.model_validator(mode="after") @pydantic.model_validator(mode="after")
@classmethod @classmethod
def check_networks(cls, model): def check_networks(cls, model):
"""Check if the `SocialNetwork` is provided correctly."""
if model.network == "Mastodon": if model.network == "Mastodon":
if not model.username.startswith("@"): if not model.username.startswith("@"):
raise ValueError( raise ValueError(
@ -782,6 +804,11 @@ class CurriculumVitae(RenderCVBaseModel):
# ====================================================================================== # ======================================================================================
# ====================================================================================== # ======================================================================================
# Create a custom type called Design:
# It is a union of all the design options and the correct design option is determined by
# the theme field, thanks to Pydantic's discriminator feature.
# See https://docs.pydantic.dev/2.5/concepts/fields/#discriminator for more information
# about discriminators.
Design = ClassicThemeOptions Design = ClassicThemeOptions
@ -799,11 +826,200 @@ class RenderCVDataModel(RenderCVBaseModel):
) )
def generate_json_schema(output_directory: str) -> str: def escape_latex_characters(sentence: str) -> str:
"""Generate the JSON schema of the data model and save it to a file. """Escape $\LaTeX$ characters in a string.
This function is called during the reading of the input file. Before the validation
process, each input field's special $\\LaTeX$ characters are escaped.
Example:
```python
escape_latex_characters("This is a # string.")
```
will return:
`#!python "This is a \\# string."`
"""
# Dictionary of escape characters:
escape_characters = {
"#": r"\#",
# "$": r"\$", # Don't escape $ as it is used for math mode
"%": r"\%",
"&": r"\&",
"~": r"\textasciitilde{}",
# "_": r"\_", # Don't escape _ as it is used for math mode
# "^": r"\textasciicircum{}", # Don't escape ^ as it is used for math mode
}
# Don't escape links as hyperref package will do it automatically:
# Find all the links in the sentence:
links = re.findall(r"\[.*?\]\(.*?\)", sentence)
# Replace the links with a placeholder:
for link in links:
sentence = sentence.replace(link, "!!-link-!!")
# Loop through the letters of the sentence and if you find an escape character,
# replace it with its LaTeX equivalent:
copy_of_the_sentence = sentence
for character in copy_of_the_sentence:
if character in escape_characters:
sentence = sentence.replace(character, escape_characters[character])
# Replace the links with the original links:
for link in links:
sentence = sentence.replace("!!-link-!!", link)
return sentence
def markdown_to_latex(markdown_string: str) -> str:
"""Convert a markdown string to LaTeX.
This function is called during the reading of the input file. Before the validation
process, each input field is converted from markdown to LaTeX.
Example:
```python
markdown_to_latex("This is a **bold** text with an [*italic link*](https://google.com).")
```
will return:
`#!python "This is a \\textbf{bold} text with an \\href{https://google.com}{\\textit{italic link}}."`
Args: Args:
output_directory (str): The output directory to save the schema. markdown_string (str): The markdown string to convert.
Returns:
str: The LaTeX string.
"""
# convert links
links = re.findall(r"\[([^\]\[]*)\]\((.*?)\)", markdown_string)
if links is not None:
for link in links:
link_text = link[0]
link_url = link[1]
old_link_string = f"[{link_text}]({link_url})"
new_link_string = "\\href{" + link_url + "}{" + link_text + "}"
markdown_string = markdown_string.replace(old_link_string, new_link_string)
# convert bold
bolds = re.findall(r"\*\*([^\*]*)\*\*", markdown_string)
if bolds is not None:
for bold_text in bolds:
old_bold_text = f"**{bold_text}**"
new_bold_text = "\\textbf{" + bold_text + "}"
markdown_string = markdown_string.replace(old_bold_text, new_bold_text)
# convert italic
italics = re.findall(r"\*([^\*]*)\*", markdown_string)
if italics is not None:
for italic_text in italics:
old_italic_text = f"*{italic_text}*"
new_italic_text = "\\textit{" + italic_text + "}"
markdown_string = markdown_string.replace(old_italic_text, new_italic_text)
# convert code
codes = re.findall(r"`([^`]*)`", markdown_string)
if codes is not None:
for code_text in codes:
old_code_text = f"`{code_text}`"
new_code_text = "\\texttt{" + code_text + "}"
markdown_string = markdown_string.replace(old_code_text, new_code_text)
latex_string = markdown_string
return latex_string
@time_the_event_below("Reading and validating the input file")
def read_input_file(file_path: pathlib.Path) -> RenderCVDataModel:
    """Read the input file and return an instance of RenderCVDataModel.

    This function reads the input file, escapes the special $\\LaTeX$ characters of
    every string field, converts the markdown in those fields to $\\LaTeX$, and
    validates the result with the data models.

    Args:
        file_path (pathlib.Path): The path to the input file.

    Returns:
        RenderCVDataModel: The data model created from the content of the input file.

    Raises:
        FileNotFoundError: If the input file doesn't exist.
        ValueError: If the extension of the input file is not an accepted one.
    """
    # check if the file exists:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The input file {file_path} doesn't exist.")

    # check the file extension:
    accepted_extensions = [".yaml", ".yml", ".json", ".json5"]
    if file_path.suffix not in accepted_extensions:
        raise ValueError(
            "The input file should have one of the following extensions:"
            f" {accepted_extensions}. The input file is {file_path}."
        )

    # The encoding is given explicitly so that non-ASCII characters (e.g., in names)
    # are read the same way on every platform instead of depending on the locale:
    with open(file_path, encoding="utf-8") as file:
        file_content = file.read()

    parsed_dictionary: dict[str, Any] = strictyaml.load(file_content).data  # type: ignore

    def convert_string(text: str) -> str:
        """Escape the LaTeX characters of a string and then convert its markdown to
        LaTeX. The order matters: escaping must not touch the LaTeX commands that
        the markdown conversion produces.
        """
        return markdown_to_latex(escape_latex_characters(text))

    def loop_through_dictionary(dictionary: dict[str, Any]) -> dict[str, Any]:
        """Recursively loop through a dictionary and apply `markdown_to_latex` and
        `escape_latex_characters` to all the fields.

        Args:
            dictionary (dict[str, Any]): The dictionary to loop through. It is
                modified in place.

        Returns:
            dict[str, Any]: The same dictionary with markdown_to_latex and
                escape_latex_characters applied to all the string fields.
        """
        # Only the values of the existing keys are reassigned, so it is safe to do
        # it while iterating over the dictionary.
        for key, value in dictionary.items():
            if isinstance(value, str):
                dictionary[key] = convert_string(value)
            elif isinstance(value, list):
                # strings and dictionaries inside the list are converted; any other
                # item is left as it is:
                for index, item in enumerate(value):
                    if isinstance(item, str):
                        value[index] = convert_string(item)
                    elif isinstance(item, dict):
                        value[index] = loop_through_dictionary(item)
            elif isinstance(value, dict):
                dictionary[key] = loop_through_dictionary(value)

        return dictionary

    parsed_dictionary = loop_through_dictionary(parsed_dictionary)

    # validate the parsed dictionary by creating an instance of RenderCVDataModel:
    data = RenderCVDataModel(**parsed_dictionary)  # type: ignore

    return data
def generate_json_schema(json_schema_path: pathlib.Path):
"""Generate the JSON schema of the data model and save it to a file.
JSON schema is generated for the users to make it easier for them to write the input
file. The JSON Schema of RenderCV is saved in the `docs` directory of the repository
and distributed to the users with the
[JSON Schema Store](https://www.schemastore.org/).
Args:
json_schema_path (pathlib.Path): The path to save the JSON schema.
""" """
class RenderCVSchemaGenerator(pydantic.json_schema.GenerateJsonSchema): class RenderCVSchemaGenerator(pydantic.json_schema.GenerateJsonSchema):
@ -870,186 +1086,5 @@ def generate_json_schema(output_directory: str) -> str:
# Change all anyOf to oneOf # Change all anyOf to oneOf
schema = schema.replace('"anyOf"', '"oneOf"') schema = schema.replace('"anyOf"', '"oneOf"')
path_to_schema = os.path.join(output_directory, "schema.json") with open(json_schema_path, "w") as f:
with open(path_to_schema, "w") as f:
f.write(schema) f.write(schema)
return path_to_schema
def escape_latex_characters(sentence: str) -> str:
    """Escape LaTeX characters in a string.

    The characters `#`, `%`, `&`, and `~` are replaced with their LaTeX equivalents.
    Markdown links (`[text](url)`) are copied without any change.

    Example:
        ```python
        escape_latex_characters("This is a # string.")
        ```
        will return:
        `#!python "This is a \\# string."`

    Args:
        sentence (str): The string to escape.

    Returns:
        str: The string with the LaTeX characters outside of the links escaped.
    """
    # Dictionary of escape characters. `$`, `_`, and `^` are deliberately left out
    # because they are used for math mode:
    escape_characters = {
        "#": r"\#",
        "%": r"\%",
        "&": r"\&",
        "~": r"\textasciitilde{}",
    }
    # A translation table escapes every occurrence exactly once in a single pass.
    # Calling `str.replace` once per occurrence would escape a repeated character
    # again and again (e.g., "# #" would become "\\# \\#").
    translation_table = str.maketrans(escape_characters)

    # Don't escape links as hyperref package will do it automatically. Instead of
    # swapping the links with a placeholder, only the text between the links is
    # escaped, so several links in one sentence cannot be mixed up:
    pieces = []
    end_of_the_previous_link = 0
    for link in re.finditer(r"\[.*?\]\(.*?\)", sentence):
        text_before_the_link = sentence[end_of_the_previous_link : link.start()]
        pieces.append(text_before_the_link.translate(translation_table))
        pieces.append(link.group(0))
        end_of_the_previous_link = link.end()

    # Escape whatever is left after the last link (the whole sentence if there are
    # no links at all):
    pieces.append(sentence[end_of_the_previous_link:].translate(translation_table))

    return "".join(pieces)
def markdown_to_latex(markdown_string: str) -> str:
    """Convert a markdown string to LaTeX.

    Links, bold text, italic text, and inline code are converted, in this order.

    Example:
        ```python
        markdown_to_latex("This is a **bold** text with an [*italic link*](https://google.com).")
        ```
        will return:
        `#!python "This is a \\textbf{bold} text with an \\href{https://google.com}{\\textit{italic link}}."`

    Args:
        markdown_string (str): The markdown string to convert.

    Returns:
        str: The LaTeX string.
    """
    # Each conversion is a (pattern, replacement) pair. The replacements are
    # functions instead of template strings so that backslashes in the matched text
    # are copied literally and not interpreted as regular expression escapes.
    # The order matters: bold (`**`) has to be converted before italic (`*`).
    conversions = [
        # convert links
        (
            r"\[([^\]\[]*)\]\((.*?)\)",
            lambda match: "\\href{" + match.group(2) + "}{" + match.group(1) + "}",
        ),
        # convert bold
        (
            r"\*\*([^\*]*)\*\*",
            lambda match: "\\textbf{" + match.group(1) + "}",
        ),
        # convert italic
        (
            r"\*([^\*]*)\*",
            lambda match: "\\textit{" + match.group(1) + "}",
        ),
        # convert code
        (
            r"`([^`]*)`",
            lambda match: "\\texttt{" + match.group(1) + "}",
        ),
    ]

    latex_string = markdown_string
    for pattern, replacement in conversions:
        latex_string = re.sub(pattern, replacement, latex_string)

    return latex_string
def read_input_file(file_path: str) -> RenderCVDataModel:
    """Read the input file and return an instance of RenderCVDataModel.

    The special LaTeX characters of every string field are escaped, the markdown in
    those fields is converted to LaTeX, and the result is validated with the data
    models. The start and the end of the process are reported with `information`.

    Args:
        file_path (str): The path to the input file.

    Returns:
        RenderCVDataModel: The data model created from the content of the input file.

    Raises:
        FileNotFoundError: If the input file doesn't exist.
        ValueError: If the extension of the input file is not an accepted one.
    """
    # A monotonic clock is used because only the elapsed time is of interest:
    start_time = time.perf_counter()
    information(f"Reading and validating the input file {file_path} has started.")

    # check if the file exists:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The input file {file_path} doesn't exist.")

    # check the file extension:
    accepted_extensions = [".yaml", ".yml", ".json", ".json5"]
    if not file_path.endswith(tuple(accepted_extensions)):
        raise ValueError(
            f"The file {file_path} doesn't have an accepted extension!"
            f" Accepted extensions are: {accepted_extensions}"
        )

    # The encoding is given explicitly so that non-ASCII characters (e.g., in names)
    # are read the same way on every platform instead of depending on the locale:
    with open(file_path, encoding="utf-8") as file:
        file_content = file.read()

    parsed_dictionary: dict[str, Any] = strictyaml.load(file_content).data  # type: ignore

    def convert_string(text: str) -> str:
        """Escape the LaTeX characters of a string and then convert its markdown to
        LaTeX. The order matters: escaping must not touch the LaTeX commands that
        the markdown conversion produces.
        """
        return markdown_to_latex(escape_latex_characters(text))

    def loop_through_dictionary(dictionary: dict[str, Any]) -> dict[str, Any]:
        """Recursively loop through a dictionary and apply markdown_to_latex and
        escape_latex_characters to all the fields.

        Args:
            dictionary (dict[str, Any]): The dictionary to loop through. It is
                modified in place.

        Returns:
            dict[str, Any]: The same dictionary with markdown_to_latex and
                escape_latex_characters applied to all the string fields.
        """
        # Only the values of the existing keys are reassigned, so it is safe to do
        # it while iterating over the dictionary.
        for key, value in dictionary.items():
            if isinstance(value, str):
                dictionary[key] = convert_string(value)
            elif isinstance(value, list):
                # strings and dictionaries inside the list are converted; any other
                # item is left as it is:
                for index, item in enumerate(value):
                    if isinstance(item, str):
                        value[index] = convert_string(item)
                    elif isinstance(item, dict):
                        value[index] = loop_through_dictionary(item)
            elif isinstance(value, dict):
                dictionary[key] = loop_through_dictionary(value)

        return dictionary

    parsed_dictionary = loop_through_dictionary(parsed_dictionary)

    # validate the parsed dictionary by creating an instance of RenderCVDataModel:
    data = RenderCVDataModel(**parsed_dictionary)  # type: ignore

    time_taken = time.perf_counter() - start_time
    information(
        f"Reading and validating the input file {file_path} has finished in"
        f" {time_taken:.2f} s."
    )
    return data