Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
d5de345
chore(Beeri Mekhilta): Generate Mekhilta map for Tractates Pischa, Va…
saengel Aug 17, 2023
265cbf3
chore(Beeri Mekhilta): Reformat
saengel Aug 17, 2023
81921a1
chore(Beeri Mekhilta): Experimentation with mapping unmapped masechtot
saengel Aug 24, 2023
0e165b9
chore(Beeri Mekhilta): Prevent early searches to ensure sequential ma…
saengel Aug 28, 2023
78f6c98
chore(Beeri Mekhilta): Generate report, modularize code, refine, add …
saengel Aug 28, 2023
e94ac6b
feat(Beeri Mekhilta): Working cascade for single refs
saengel Sep 11, 2023
7404902
feat(Beeri Mekhilta): First functional pass at cascade
saengel Sep 12, 2023
d2d0aed
chore(Beeri Mekhilta): Added notes
saengel Oct 19, 2023
c6078e4
fix(Beeri Mekhilta): Clean up cascade code
saengel Oct 26, 2023
428ea78
fix(Beeri Mekhilta): Cleaned cascade, reversion streamlined
saengel Oct 26, 2023
6fd8ea9
chore(Beeri Mekhilta): Playing with idx names to avoid rename error
saengel Oct 29, 2023
adbb095
fix(Beeri Mekhilta): Name difference for Beeri version to avoid renam…
saengel Oct 31, 2023
b307ee5
feat(Beeri Mekhilta): Generate extensive link report
saengel Oct 31, 2023
2192f46
chore(Beeri Mekhilta): Refined link report
saengel Nov 5, 2023
dc75874
chore(Beeri Mekhilta): Clean out old CSV files in anticipation of fur…
saengel Nov 30, 2023
f7a4bfb
chore(Beeri Mekhilta): Include link report on branch
saengel Nov 30, 2023
72d64f7
chore(Beeri Mekhilta): Flatten out refs on both sides of data
saengel Dec 4, 2023
b03854f
feat(Beeri Mekhilta): Add script for book renaming
saengel Dec 4, 2023
1f4cbb9
feat(Beeri Mekhilta): In progress script for source sheet mapping
saengel Dec 4, 2023
885b032
chore(Beeri Mekhilta): Flatten ranged refs further
saengel Dec 5, 2023
9f6edbe
chore(Beeri Mekhilta): Fix bugs in source sheet script
saengel Dec 5, 2023
4a63d97
chore(Beeri Mekhilta): Fix minor bugs, update todo
saengel Dec 5, 2023
3c6fe72
chore(Beeri Mekhilta): Add Hebrew to rename books script
saengel Dec 6, 2023
728a17e
feat(Beeri Mekhilta): Handles the sources field in the DB
saengel Dec 6, 2023
5209aaa
chore(Beeri Mekhilta): Fix bug in json generation
saengel Dec 6, 2023
4c03272
feat(Beeri Mekhilta): JSON for sheets remap
saengel Dec 6, 2023
11e7b20
chore(Beeri Mekhilta): Shift to data to match name updates on prod, d…
saengel Dec 18, 2023
d67a555
chore(Beeri Mekhilta): Code for source sheet remap in subdir
saengel Dec 18, 2023
29381f4
chore(Beeri Mekhilta): Update the code to handle Mekhilta linked to i…
saengel Dec 18, 2023
6638375
chore(Beeri Mekhilta): Start organizing and reusing code, clean up pr…
saengel Dec 18, 2023
9f9597e
chore(Beeri Mekhilta): Code to generate mapping with assoc text for QA
saengel Dec 18, 2023
35c37dd
feat(Beeri Mekhilta): Complete linkset from prod as of Dec 18, new so…
saengel Dec 18, 2023
56f0420
chore(Beeri Mekhilta): Generate mapped links as per new linkset
saengel Dec 18, 2023
5c3bc9b
chore(Beeri Mekhilta): Remove source sheet code from parent dir
saengel Dec 18, 2023
d6d29db
chore(Beeri Mekhilta): First pass at using ParallelMatcher for Mesora…
saengel Dec 18, 2023
d5f2746
chore(Beeri Mekhilta): Rename directory
saengel Dec 20, 2023
2d706f8
chore(Beeri Mekhilta): Update rename_books for new prod naming
saengel Dec 20, 2023
cf58b80
chore(Beeri Mekhilta): Remove heavy sheets data
saengel Dec 20, 2023
adc3619
chore(Beeri Mekhilta): Flatten mapping
saengel Jan 2, 2024
aa85261
chore(Beeri Mekhilta): Rewrite using bulk write
saengel Jan 2, 2024
d04007a
feat(Beeri Mekhilta): Working pymongo scripts
saengel Jan 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
2,043 changes: 2,043 additions & 0 deletions sources/Content_Quality/beeri_mekhilta/beeri_en_version.csv

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions sources/Content_Quality/beeri_mekhilta/beeri_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import django

django.setup()

from sefaria.model import *
import csv

def write_to_csv(file_path, dict_list):
    """Write a list of same-keyed dicts to *file_path* as CSV.

    Fixes vs. original: the header row is now actually written
    (DictWriter.writeheader() was never called, so consumers could not
    tell which column was which), and an empty list no longer raises
    IndexError on dict_list[0].
    """
    if not dict_list:
        print(f"No data to write to {file_path}")
        return

    # newline='' is required by the csv module to avoid doubled line
    # endings on Windows.
    with open(file_path, 'w', newline='') as csv_file:
        # Column order comes from the first row's key order.
        fieldnames = dict_list[0].keys()
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        csv_writer.writeheader()  # was missing in the original
        csv_writer.writerows(dict_list)

    print(f"Data has been written to {file_path}")
196 changes: 196 additions & 0 deletions sources/Content_Quality/beeri_mekhilta/create_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import django

django.setup()

import csv
import re
from sefaria.model import *
from parsing_utilities.text_align import CompareBreaks


def convert_csv():
    """Read beeri_version.csv and bucket its rows by masechet.

    Returns two parallel dicts keyed by masechet name: one holding the
    text segments, the other holding the corresponding trefs, both in
    file order.
    """
    text_str_by_masechet = {}
    tref_to_index_map = {}
    masechet_pattern = r"Mekhilta DeRabbi Yishmael Beeri, (.*) [^A-Za-z]*$"

    with open('beeri_version.csv', newline='') as csvfile:
        for tref, text_segment, *_ in csv.reader(csvfile):
            # Extract the masechet name from the tref (trailing chapter/verse
            # numbers are stripped by the regex).
            masechet = re.search(masechet_pattern, tref).group(1)
            text_str_by_masechet.setdefault(masechet, []).append(text_segment)
            tref_to_index_map.setdefault(masechet, []).append(tref)

    return text_str_by_masechet, tref_to_index_map


def get_prod_list_of_strs():
    """Collect the prod (Silverstein) Mekhilta text, bucketed by masechet.

    Walks every segment of the prod version and files it under whichever
    masechet's Exodus range contains it. Returns two parallel dicts keyed
    by masechet: text segments and their trefs, in walk order.
    """
    all_text_by_masechet = {}
    tref_index_map = {}

    # Each masechet corresponds to a fixed ranged ref in the prod index.
    category_masechet_map = {
        "Tractate Pischa": Ref("Mekhilta d'Rabbi Yishmael 12:1-13:16"),
        "Tractate Vayehi Beshalach": Ref("Mekhilta d'Rabbi Yishmael 13:17-14:31"),
        "Tractate Shirah": Ref("Mekhilta d'Rabbi Yishmael 15:1-21"),
        "Tractate Vayassa": Ref("Mekhilta d'Rabbi Yishmael 15:22-17:7"),
        "Tractate Amalek": Ref("Mekhilta d'Rabbi Yishmael 17:8-18:27"),
        "Tractate Bachodesh": Ref("Mekhilta d'Rabbi Yishmael 19:1-20:26"),
        "Tractate Nezikin": Ref("Mekhilta d'Rabbi Yishmael 21:1-22:23"),
        "Tractate Kaspa": Ref("Mekhilta d'Rabbi Yishmael 22:24-23:19"),
        "Tractate Shabbata": Ref("Mekhilta d'Rabbi Yishmael 31:12-35:3"),
    }

    def action(segment_str, tref, he_tref, version):
        # walk_thru_contents callback: file the segment under every
        # masechet range that contains it (ranges are disjoint in practice).
        for masechet, masechet_range in category_masechet_map.items():
            if Ref(tref) in masechet_range.all_segment_refs():
                all_text_by_masechet.setdefault(masechet, []).append(segment_str)
                tref_index_map.setdefault(masechet, []).append(tref)

    mekhilta_version = Version().load({'versionTitle': "Mechilta, translated by Rabbi Shraga Silverstein"})
    mekhilta_version.walk_thru_contents(action)
    return all_text_by_masechet, tref_index_map


def write_to_csv(masechet, map):
    """Write a list of same-keyed dicts to "<masechet>_mapping.csv".

    Fixes vs. original: opens the file with newline='' (required by the
    csv module; without it rows get doubled line endings on Windows) and
    no longer raises IndexError when *map* is empty.

    NOTE: the parameter name `map` shadows the builtin; kept for
    interface compatibility with existing callers.
    """
    if not map:
        print(f"No rows for {masechet}; skipped CSV")
        return

    # Column order comes from the first row's key order.
    header = map[0].keys()

    with open(f"{masechet}_mapping.csv", "w", newline='') as f:
        writer = csv.writer(f)
        # Write the header row, then one row of values per dict.
        writer.writerow(header)
        for row in map:
            writer.writerow(row.values())
    print(f"Wrote {masechet} to CSV")


def normalize_string(text_segment):
    """Strip every non-alphanumeric character from *text_segment*."""
    return ''.join(filter(str.isalnum, text_segment))


def compare_text_approach(beeri_text_strs, beeri_tref_to_index_map, prod_text_strs, prod_tref_to_index_map):
    """Map Beeri segments to prod segments using CompareBreaks, per masechet.

    Note: This only worked for 6/9 of the Masechtot, which is why we filter
    the "breaking" masechtot on top.
    """
    breaking_masechtot = ["Tractate Nezikin", "Tractate Shirah", "Tractate Vayehi Beshalach"]

    for masechet in beeri_text_strs:
        if masechet in breaking_masechtot:
            continue

        comparer = CompareBreaks(beeri_text_strs[masechet], prod_text_strs[masechet])
        break_map = comparer.create_mapping()

        expanded_mapping = []
        for beeri_idx, prod_candidates in break_map.items():
            candidates = list(prod_candidates)
            if not candidates:
                continue
            # Take the first candidate; the code offsets by -1, which
            # assumes CompareBreaks indices are 1-based — confirm.
            prod_idx = candidates[0]
            expanded_mapping.append({
                "Beeri Ref": beeri_tref_to_index_map[masechet][beeri_idx - 1],
                "Wiki Ref": prod_tref_to_index_map[masechet][prod_idx - 1],
            })
        write_to_csv(masechet, expanded_mapping)


def normalize_text(beeri_text_strs, prod_text_strs):
    """Normalize both text corpora, masechet by masechet.

    Every segment is reduced to its alphanumeric characters only, so the
    two corpora can be compared despite punctuation/whitespace differences.
    Returns (normalized_beeri, normalized_prod), keyed like the inputs.
    """
    def _alnum_only(segment):
        # Same normalization as normalize_string(), inlined here.
        return ''.join(ch for ch in segment if ch.isalnum())

    normalized_beeri_text_strs = {}
    normalized_prod_text_strs = {}

    # Iterate the Beeri keys; a masechet missing from prod raises KeyError,
    # exactly as before.
    for masechet, beeri_segments in beeri_text_strs.items():
        normalized_prod_text_strs[masechet] = [
            _alnum_only(seg) for seg in prod_text_strs[masechet]
        ]
        normalized_beeri_text_strs[masechet] = [
            _alnum_only(seg) for seg in beeri_segments
        ]

    return normalized_beeri_text_strs, normalized_prod_text_strs


def brute_force_mapping(normalized_beeri, normalized_prod, beeri_tref_to_index_map, prod_tref_to_index_map):
    """Brute force approach to get a quasi-map for the failing masechtot
    (Nezikin, Vayehi Beshalach, and Shirah) - which failed the compare
    breaks approach. The generated CSV will need some manual work.

    For each non-empty Beeri segment, any prod segment containing its
    first 100 normalized characters is recorded as a candidate match.
    """
    for masechet in ["Tractate Nezikin", "Tractate Shirah", "Tractate Vayehi Beshalach"]:
        expanded_mapping = []
        prod_segments = normalized_prod[masechet]

        for i, beeri_segment in enumerate(normalized_beeri[masechet]):
            if not beeri_segment:
                continue
            # Hoisted: the 100-char probe is loop-invariant per Beeri segment.
            probe = beeri_segment[:100]

            for j, prod_segment in enumerate(prod_segments):
                if probe in prod_segment:
                    expanded_mapping.append({
                        "Beeri Ref": beeri_tref_to_index_map[masechet][i],
                        "Prod Ref": prod_tref_to_index_map[masechet][j],
                    })
                    # NOTE(review): the original had a bare no-op expression
                    # `prod_list[j:]` here, presumably an unfinished attempt to
                    # restrict later searches to segments after the match
                    # (cf. "prevent early searches" in history). It did nothing,
                    # so it was removed; behavior is unchanged. Confirm whether
                    # sequential narrowing was actually intended.

        write_to_csv(f"manual_report_{masechet}", expanded_mapping)


def generate_report_beeri():
    """Print Beeri refs from the "breaking" masechtot that no brute-force
    report covered, to guide the remaining manual mapping work.

    Reads beeri_version.csv for the full ref list and the three
    manual_report_*_mapping.csv files for the already-processed refs;
    prints the difference (order is unspecified — set difference).
    """
    breaking_masechtot = ["Tractate Nezikin", "Tractate Shirah", "Tractate Vayehi Beshalach"]

    # "All" list: every Beeri ref belonging to a breaking masechet.
    beeri_refs_all = []
    with open('beeri_version.csv', newline='') as csvfile:
        for row in csv.reader(csvfile):
            tref = row[0]
            masechet = re.findall(r"Mekhilta DeRabbi Yishmael Beeri, (.*) [^A-Za-z]*$", tref)[0]
            if masechet in breaking_masechtot:
                beeri_refs_all.append(tref)

    # "Processed" list: one loop replaces the three copy-pasted stanzas
    # that read the per-masechet brute-force reports.
    beeri_refs_processed = []
    for masechet in ["Tractate Vayehi Beshalach", "Tractate Shirah", "Tractate Nezikin"]:
        with open(f'manual_report_{masechet}_mapping.csv', newline='') as csvfile:
            for row in csv.reader(csvfile):
                beeri_refs_processed.append(row[0])

    unreported_refs = set(beeri_refs_all) - set(beeri_refs_processed)
    for ref in unreported_refs:
        print(ref)


if __name__ == "__main__":
    # NOTE(review): csv_file is unused — convert_csv() hardcodes the same
    # filename internally.
    csv_file = "beeri_version.csv"
    # Bucket both corpora (Beeri CSV and prod version) by masechet.
    beeri_text_strs, beeri_tref_to_index_map = convert_csv()
    prod_text_strs, prod_tref_to_index_map = get_prod_list_of_strs()

    # Compare Text Approach to map Masechtot Pischa, Vayassa, Amalek, Bachodesh, Kaspa and Shabbata
    compare_text_approach(beeri_text_strs, beeri_tref_to_index_map, prod_text_strs, prod_tref_to_index_map)

    # Approach for "breaking" masechtot: Vayehi Beshalach, Shirah and Nezikin

    # Step One: Normalize English text (reduce segments to alphanumerics only)
    normalized_beeri, normalized_prod = normalize_text(beeri_text_strs, prod_text_strs)

    # Step Two: Apply a Brute Force mapping algorithm
    brute_force_mapping(normalized_beeri, normalized_prod, beeri_tref_to_index_map, prod_tref_to_index_map)

    # Step Three: Generate a report of missing segments to aid manual work
    generate_report_beeri()
79 changes: 79 additions & 0 deletions sources/Content_Quality/beeri_mekhilta/fix_cascade.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import django

django.setup()

from sefaria.model import *
from sefaria.helper.schema import cascade
import csv


def rename_books():
    """Swap index titles so the Beeri text takes over the canonical name.

    The old prod index is retitled first so its name is free for the
    Beeri index. Factored the duplicated load/retitle/save stanza into a
    helper.
    """
    # Todo - cascade name change? Index should cascade...

    def _rename(old_title, new_title):
        # One title swap: load the index by exact title, retitle, persist.
        index = Index().load({"title": old_title})
        print(f"Retrieved {index.title}")
        index.set_title(new_title)
        index.save()
        print(f"Saved and renamed {index.title}")

    _rename("Mekhilta DeRabbi Yishmael", "Old Mekhilta d'Rabbi Yishmael")
    _rename("Mekhilta DeRabbi Yishmael Beeri", "Mekhilta d'Rabbi Yishmael")


def ingest_map():
    """Load full_mapping.csv into {prod_ref: [beeri_ref, ...]}.

    A prod ref that appears on several rows accumulates all of its Beeri
    refs in row order.
    """
    mapping = {}
    with open("full_mapping.csv", "r") as f:
        for row in csv.DictReader(f):
            mapping.setdefault(row["prod_ref"], []).append(row["beeri_ref"])
    return mapping


def rewriter_function(prod_ref):
    """Rewrite a prod ref string to the equivalent Beeri ref for cascade().

    Returns the mapped ref in normal form, or "" when the ref cannot be
    handled (cascade will then leave it to manual follow-up).

    Fixes vs. original in the section/book-level branch:
    - mapper keys are *strings* from the CSV, but starting_ref()/ending_ref()
      return Ref objects, so the membership test could never succeed; the
      keys are now looked up via .normal() (assumes the CSV stores
      normal-form refs — confirm against full_mapping.csv).
    - mapper[...] yields a *list* of Beeri refs, which was passed directly
      to Ref(); now the first/last elements are taken, mirroring the
      segment-level branch.
    """
    # NOTE: reloads the CSV on every invocation; hoist if cascade is slow.
    mapper = ingest_map()
    print(prod_ref)

    # If a segment-level ref
    if prod_ref in mapper:
        cur_beeri_ref_list = mapper[prod_ref]

        # If the segment-level ref maps to multiple Beeri refs, collapse
        # them into one ranged ref spanning first..last.
        if len(cur_beeri_ref_list) > 1:
            first_ref = cur_beeri_ref_list[0]
            last_ref = cur_beeri_ref_list[-1]
            ranged_ref = Ref(first_ref).to(last_ref)
            return ranged_ref.normal()

        # If the segment-level ref maps to a single Beeri ref
        else:
            return cur_beeri_ref_list[0]

    # If a section or book level ref, determine the mapping based on the segment refs within
    elif not Ref(prod_ref).is_segment_level():
        segment_ranged_ref = Ref(prod_ref).as_ranged_segment_ref()
        start_key = segment_ranged_ref.starting_ref().normal()
        end_key = segment_ranged_ref.ending_ref().normal()
        if start_key in mapper and end_key in mapper:
            first_ref = mapper[start_key][0]
            last_ref = mapper[end_key][-1]
            ranged_ref = Ref(first_ref).to(last_ref)
            return ranged_ref.normal()
        else:
            # Catches weird/wrong refs, like Mekhilta 1, which does not exist - but according to the code will become Mekhilta 1:1:1
            print(f"ERROR: {prod_ref} was not handled by rewriter")
            return ""
    else:
        print(f"ERROR: {prod_ref} was not handled by rewriter")
        return ""


if __name__ == '__main__':
    # Run the following function once on DB refresh, make sure Mekhilta copied from Piaczena DB (index & text)
    rename_books()

    # NOTE(review): the title below ("Mekhilta d'Rabbi Yishmael Old") does not
    # match what rename_books() produces ("Old Mekhilta d'Rabbi Yishmael") —
    # confirm the intended cascade target before enabling this call.
    # cascade("Mekhilta d'Rabbi Yishmael Old", rewriter=rewriter_function, skip_history=False)
Loading