handle markdown to LaTeX differently

This commit is contained in:
Sina Atalay 2024-02-13 20:00:16 +01:00
parent 4b6d0d1634
commit ef741249d3
2 changed files with 198 additions and 194 deletions

View File

@ -23,7 +23,6 @@ import json
import re
import ssl
import pathlib
import copy
import pydantic
import pydantic_extra_types.phone_numbers as pydantic_phone_numbers
@ -1071,193 +1070,9 @@ class RenderCVDataModel(RenderCVBaseModel):
return theme_data_model
def escape_latex_characters(string: str) -> str:
"""Escape $\\LaTeX$ characters in a string.
This function is called during the reading of the input file. Before the validation
process, each input field's special $\\LaTeX$ characters are escaped.
Example:
```python
escape_latex_characters("This is a # string.")
```
will return:
`#!python "This is a \\# string."`
"""
# Dictionary of escape characters:
escape_characters = {
"#": "\\#",
# "$": "\\$", # Don't escape $ as it is used for math mode
"%": "\\%",
"&": "\\&",
"~": "\\textasciitilde{}",
# "_": "\\_", # Don't escape _ as it is used for math mode
# "^": "\\textasciicircum{}", # Don't escape ^ as it is used for math mode
}
# Don't escape links as hyperref package will do it automatically:
# Find all the links in the sentence:
links = re.findall(r"\[.*?\]\(.*?\)", string)
# Replace the links with a placeholder:
for link in links:
string = string.replace(link, "!!-link-!!")
# Loop through the letters of the sentence and if you find an escape character,
# replace it with its LaTeX equivalent:
copy_of_the_string = list(string)
for i, character in enumerate(copy_of_the_string):
if character in escape_characters:
new_character = escape_characters[character]
copy_of_the_string[i] = new_character
string = "".join(copy_of_the_string)
# Replace the links with the original links:
for link in links:
string = string.replace("!!-link-!!", link)
return string
def markdown_to_latex(markdown_string: str) -> str:
"""Convert a markdown string to LaTeX.
This function is called during the reading of the input file. Before the validation
process, each input field is converted from markdown to LaTeX.
Example:
```python
markdown_to_latex("This is a **bold** text with an [*italic link*](https://google.com).")
```
will return:
`#!pytjon "This is a \\textbf{bold} text with a \\href{https://google.com}{\\textit{link}}."`
Args:
markdown_string (str): The markdown string to convert.
Returns:
str: The LaTeX string.
"""
# convert links
links = re.findall(r"\[([^\]\[]*)\]\((.*?)\)", markdown_string)
if links is not None:
for link in links:
link_text = link[0]
link_url = link[1]
old_link_string = f"[{link_text}]({link_url})"
new_link_string = "\\href{" + link_url + "}{" + link_text + "}"
markdown_string = markdown_string.replace(old_link_string, new_link_string)
# convert bold
bolds = re.findall(r"\*\*([^\*]*)\*\*", markdown_string)
if bolds is not None:
for bold_text in bolds:
old_bold_text = f"**{bold_text}**"
new_bold_text = "\\textbf{" + bold_text + "}"
markdown_string = markdown_string.replace(old_bold_text, new_bold_text)
# convert italic
italics = re.findall(r"\*([^\*]*)\*", markdown_string)
if italics is not None:
for italic_text in italics:
old_italic_text = f"*{italic_text}*"
new_italic_text = "\\textit{" + italic_text + "}"
markdown_string = markdown_string.replace(old_italic_text, new_italic_text)
# convert code
codes = re.findall(r"`([^`]*)`", markdown_string)
if codes is not None:
for code_text in codes:
old_code_text = f"`{code_text}`"
new_code_text = "\\texttt{" + code_text + "}"
markdown_string = markdown_string.replace(old_code_text, new_code_text)
latex_string = markdown_string
return latex_string
def convert_a_markdown_dictionary_to_a_latex_dictionary(
dictionary: dict[str, Any],
) -> dict[str, Any]:
"""
Recursively loop through a dictionary and convert all the markdown strings (keys and
values) to LaTeX. Also, escape special LaTeX characters in the keys and values.
Example:
```python
convert_a_markdown_dictionary_to_a_latex_dictionary(
{
"key1": "This is a **bold** text with an [*italic link*](https://google.com).",
"key2": "This is a **bold** text with an [*italic link*](https://google.com).",
"**key3**": {
"key4": "This is a **bold** text with an [*italic link*](https://google.com).",
"key5": "This is a **bold** text with an [*italic link*](https://google.com).",
},
}
)
```
will return:
```python
{
"key1": "This is a \\textbf{bold} text with a \\href{https://google.com}{\\textit{link}}.",
"key2": "This is a \\textbf{bold} text with a \\href{https://google.com}{\\textit{link}}.",
"\\textbf{key3}": {
"key4": "This is a \\textbf{bold} text with a \\href{https://google.com}{\\textit{link}}.",
"key5": "This is a \\textbf{bold} text with a \\href{https://google.com}{\\textit{link}}.",
},
}
```
Args:
dictionary (dict): The dictionary to convert.
Returns:
dict: The LaTeX dictionary.
"""
for key, value in dictionary.copy().items():
if isinstance(value, str):
# if the value is a string, then apply markdown_to_latex and
# escape_latex_characters to it:
result = escape_latex_characters(value)
dictionary[key] = markdown_to_latex(result)
elif isinstance(value, list):
# if the value is a list, then loop through the list and apply
# markdown_to_latex and escape_latex_characters to each item:
for index, item in enumerate(value):
if isinstance(item, str):
result = escape_latex_characters(item)
dictionary[key][index] = markdown_to_latex(result)
elif isinstance(item, dict):
# if the item is a dictionary, then call loop_through_dictionary
# again:
dictionary[key][index] = (
convert_a_markdown_dictionary_to_a_latex_dictionary(item)
)
elif isinstance(value, dict):
# if the value is a dictionary, then call loop_through_dictionary again:
dictionary[key] = convert_a_markdown_dictionary_to_a_latex_dictionary(value)
# do the same for the key:
result = escape_latex_characters(key)
dictionary[markdown_to_latex(result)] = dictionary.pop(key)
return dictionary
def read_input_file(
file_path: pathlib.Path,
) -> tuple[RenderCVDataModel, RenderCVDataModel]:
) -> RenderCVDataModel:
"""Read the input file and return two instances of RenderCVDataModel. The first
instance is the data model with LaTeX strings and the second instance is the data
model with markdown strings.
@ -1282,16 +1097,12 @@ def read_input_file(
)
file_content = file_path.read_text(encoding="utf-8")
original_dictionary: dict[str, Any] = ruamel.yaml.YAML().load(file_content)
parsed_dictionary = convert_a_markdown_dictionary_to_a_latex_dictionary(
copy.deepcopy(original_dictionary)
)
input_as_dictionary: dict[str, Any] = ruamel.yaml.YAML().load(file_content)
# validate the parsed dictionary by creating an instance of RenderCVDataModel:
data_model_markdown = RenderCVDataModel(**original_dictionary)
data_model_latex = RenderCVDataModel(**parsed_dictionary)
rendercv_data_model = RenderCVDataModel(**input_as_dictionary)
return data_model_latex, data_model_markdown
return rendercv_data_model
def get_a_sample_data_model(name: str) -> RenderCVDataModel:

View File

@ -15,12 +15,12 @@ import pathlib
import importlib.resources
import shutil
import sys
import copy
from datetime import date as Date
from typing import Optional, Literal, Any
import jinja2
import markdown
import fpdf
from . import data_models as dm
@ -134,6 +134,16 @@ class LaTeXFile(TemplatedFile):
data model and Jinja2 templates. It inherits from the TemplatedFile class.
"""
def __init__(
self,
data_model: dm.RenderCVDataModel,
environment: jinja2.Environment,
):
data_model = transform_markdown_data_model_to_latex_data_model(
copy.deepcopy(data_model)
)
super().__init__(data_model, environment)
def render_templates(self):
"""Render and return all the templates for the $\\LaTeX$ file.
@ -312,6 +322,189 @@ class MarkdownFile(TemplatedFile):
file_path.write_text(self.get_markdown_code(), encoding="utf-8")
def escape_latex_characters(string: str) -> str:
"""Escape $\\LaTeX$ characters in a string.
This function is called during the reading of the input file. Before the validation
process, each input field's special $\\LaTeX$ characters are escaped.
Example:
```python
escape_latex_characters("This is a # string.")
```
will return:
`#!python "This is a \\# string."`
"""
# Dictionary of escape characters:
escape_characters = {
"#": "\\#",
# "$": "\\$", # Don't escape $ as it is used for math mode
"%": "\\%",
"&": "\\&",
"~": "\\textasciitilde{}",
# "_": "\\_", # Don't escape _ as it is used for math mode
# "^": "\\textasciicircum{}", # Don't escape ^ as it is used for math mode
}
# Don't escape links as hyperref package will do it automatically:
# Find all the links in the sentence:
links = re.findall(r"\[.*?\]\(.*?\)", string)
# Replace the links with a placeholder:
for link in links:
string = string.replace(link, "!!-link-!!")
# Loop through the letters of the sentence and if you find an escape character,
# replace it with its LaTeX equivalent:
copy_of_the_string = list(string)
for i, character in enumerate(copy_of_the_string):
if character in escape_characters:
new_character = escape_characters[character]
copy_of_the_string[i] = new_character
string = "".join(copy_of_the_string)
# Replace the links with the original links:
for link in links:
string = string.replace("!!-link-!!", link)
return string
def markdown_to_latex(markdown_string: str) -> str:
"""Convert a markdown string to LaTeX.
This function is called during the reading of the input file. Before the validation
process, each input field is converted from markdown to LaTeX.
Example:
```python
markdown_to_latex("This is a **bold** text with an [*italic link*](https://google.com).")
```
will return:
`#!pytjon "This is a \\textbf{bold} text with a \\href{https://google.com}{\\textit{link}}."`
Args:
markdown_string (str): The markdown string to convert.
Returns:
str: The LaTeX string.
"""
# convert links
links = re.findall(r"\[([^\]\[]*)\]\((.*?)\)", markdown_string)
if links is not None:
for link in links:
link_text = link[0]
link_url = link[1]
old_link_string = f"[{link_text}]({link_url})"
new_link_string = "\\href{" + link_url + "}{" + link_text + "}"
markdown_string = markdown_string.replace(old_link_string, new_link_string)
# convert bold
bolds = re.findall(r"\*\*([^\*]*)\*\*", markdown_string)
if bolds is not None:
for bold_text in bolds:
old_bold_text = f"**{bold_text}**"
new_bold_text = "\\textbf{" + bold_text + "}"
markdown_string = markdown_string.replace(old_bold_text, new_bold_text)
# convert italic
italics = re.findall(r"\*([^\*]*)\*", markdown_string)
if italics is not None:
for italic_text in italics:
old_italic_text = f"*{italic_text}*"
new_italic_text = "\\textit{" + italic_text + "}"
markdown_string = markdown_string.replace(old_italic_text, new_italic_text)
# convert code
codes = re.findall(r"`([^`]*)`", markdown_string)
if codes is not None:
for code_text in codes:
old_code_text = f"`{code_text}`"
new_code_text = "\\texttt{" + code_text + "}"
markdown_string = markdown_string.replace(old_code_text, new_code_text)
latex_string = markdown_string
return latex_string
def transform_markdown_data_model_to_latex_data_model(
data_model: dm.RenderCVDataModel,
) -> dm.RenderCVDataModel:
"""
Recursively loop through a `RenderCVDataModel` and convert all the markdown strings
(user input is in markdown format) to LaTeX strings. Also, escape special LaTeX
characters.
Args:
data_model (RenderCVDataModel): The data model to transform.
Returns:
dict: The data model with LaTeX strings.
"""
data_model_as_dict = data_model.model_dump()
for key, value in data_model_as_dict.items():
if isinstance(value, str):
# if the value is a string, then apply markdown_to_latex and
# escape_latex_characters to it:
result = markdown_to_latex(escape_latex_characters(value))
# update data_model object's attribute with the new value:
setattr(data_model, key, result)
elif isinstance(value, list):
# if the value is a list, then loop through the list and apply
# markdown_to_latex and escape_latex_characters to each item:
transformed_list = []
for index, item in enumerate(value):
if isinstance(item, str):
result = markdown_to_latex(escape_latex_characters(item))
transformed_list.append(result)
elif isinstance(item, dict):
# if the item is a dictionary, then it means it's a sub data model.
# So, call transform_markdown_data_model_to_latex_data_model again:
sub_data_model = getattr(data_model, key)[index]
transformed_sub_data_model = (
transform_markdown_data_model_to_latex_data_model(
sub_data_model
)
)
transformed_list.append(transformed_sub_data_model)
# update data_model object's attribute with the new value:
setattr(data_model, key, transformed_list)
elif isinstance(value, dict):
if key == "sections_input":
# Then it means it's the `sections` field:
sections = getattr(data_model, key)
for section_title, entries in sections.items():
transformed_entries = []
for entry in entries:
transformed_entry = (
transform_markdown_data_model_to_latex_data_model(entry)
)
transformed_entries.append(transformed_entry)
setattr(data_model, key, sections)
else:
# Then it means it's a sub data model.
# So, call transform_markdown_data_model_to_latex_data_model again:
sub_data_model = getattr(data_model, key)
transformed_sub_data_model = (
transform_markdown_data_model_to_latex_data_model(sub_data_model)
)
# update data_model object's attribute with the new value:
setattr(data_model, key, transformed_sub_data_model)
return data_model
def make_matched_part_something(
value: str, something: str, match_str: Optional[str] = None
) -> str: