diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv
new file mode 100644
index 000000000..6fcfb8b3a
--- /dev/null
+++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv
@@ -0,0 +1,5 @@
+हफ़्ते
+सप्ताह
+सदियां
+सदियों
+
diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv
similarity index 71%
rename from nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv
rename to nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv
index 5466df709..dc20bcb21 100644
--- a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv
+++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv
@@ -8,7 +8,4 @@ hp हॉर्सपॉवर
 d दिन
 month महीना
 months महीने
-हफ़्ते
-सप्ताह
-सदियां
-सदियों
+
diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv
new file mode 100644
index 000000000..bfe5738d0
--- /dev/null
+++ b/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv
@@ -0,0 +1,12 @@
+१ला पहला
+१ली पहली
+२रा दूसरा
+२री दूसरी
+३रा तीसरा
+३री तीसरी
+४था चौथा
+४थी चौथी
+५वां पाँचवां
+५वीं पाँचवीं
+६ठा छठा
+६ठी छठी
diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv
index 37cd2af06..922e9d6b8 100644
--- a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv
+++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv
@@ -1,4 +1,3 @@
 वां
 वीं
 वें
-वे वें
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv
new file mode 100644
index 000000000..77139cff5
--- /dev/null
+++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv
@@ -0,0 +1,2 @@
+वे वें
+
diff --git a/nemo_text_processing/text_normalization/hi/graph_utils.py b/nemo_text_processing/text_normalization/hi/graph_utils.py
index 6a5d3c699..5bbc736fd 100644
--- a/nemo_text_processing/text_normalization/hi/graph_utils.py
+++ b/nemo_text_processing/text_normalization/hi/graph_utils.py
@@ -30,6 +30,13 @@ NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize()
 NEMO_HI_NON_ZERO = pynini.union("१", "२", "३", "४", "५", "६", "७", "८", "९").optimize()
 NEMO_HI_ZERO = "०"
+
+HI_DEDH = "डेढ़"  # 1.5
+HI_DHAI = "ढाई"  # 2.5
+HI_SAVVA = "सवा"  # quarter more (1.25)
+HI_SADHE = "साढ़े"  # half more (X.5)
+HI_PAUNE = "पौने"  # quarter less (0.75)
+
 NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
 NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
 NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py
index 37b192165..b25abcac6 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/date.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/date.py
@@ -65,11 +65,11 @@ def __init__(self, cardinal: GraphFst):
             (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand
         )
 
-        cardinal_graph = (
-            digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands
+        cardinal_graph = pynini.union(
+            digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands
         )
 
-        graph_year = graph_year_thousands | graph_year_hundreds_as_thousands
+        graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands)
 
         delete_dash = pynutil.delete("-")
         delete_slash = pynutil.delete("/")
@@ -102,13 +102,10 @@ def __init__(self, cardinal: GraphFst):
         # Updated logic to use prefix_union
         year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"")
 
-        graph_dd_mm_yyyy = (
-            days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph
-        )
+        delete_separator = pynini.union(delete_dash, delete_slash)
+        graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph
 
-        graph_mm_dd_yyyy = (
-            months_graph + (delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph
-        )
+        graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph
 
         graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")
 
diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py
index d995608da..b5528deba 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py
@@ -15,9 +15,21 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.hi.graph_utils import (
+    HI_DEDH,
+    HI_DHAI,
+    HI_PAUNE,
+    HI_SADHE,
+    HI_SAVVA,
+    NEMO_SPACE,
+    GraphFst,
+)
 from nemo_text_processing.text_normalization.hi.utils import get_abs_path
 
+HI_ONE_HALF = "१/२"  # 1/2
+HI_ONE_QUARTER = "१/४"  # 1/4
+HI_THREE_QUARTERS = "३/४"  # 3/4
+
 
 class FractionFst(GraphFst):
     """
@@ -40,37 +52,62 @@ def __init__(self, cardinal, deterministic: bool = True):
         cardinal_graph = cardinal.final_graph
 
         self.optional_graph_negative = pynini.closure(
-            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
+            pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + pynutil.insert(NEMO_SPACE), 0, 1
         )
 
         self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
         self.numerator = (
-            pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", " / "), "\" ")
+            pynutil.insert("numerator: \"")
+            + cardinal_graph
+            + pynini.cross(pynini.union("/", NEMO_SPACE + "/" + NEMO_SPACE), "\"")
+            + pynutil.insert(NEMO_SPACE)
         )
 
         self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"")
 
-        dedh_dhai_graph = pynini.string_map([("१ १/२", "डेढ़"), ("२ १/२", "ढाई")])
+        dedh_dhai_graph = pynini.string_map(
+            [("१" + NEMO_SPACE + HI_ONE_HALF, HI_DEDH), ("२" + NEMO_SPACE + HI_ONE_HALF, HI_DHAI)]
+        )
 
-        savva_numbers = cardinal_graph + pynini.cross(" १/४", "")
-        savva_graph = pynutil.insert("सवा ") + savva_numbers
+        savva_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_QUARTER, "")
+        savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers
 
-        sadhe_numbers = cardinal_graph + pynini.cross(" १/२", "")
-        sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers
+        sadhe_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_HALF, "")
+        sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers
 
         paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
-        paune_numbers = paune + pynini.cross(" ३/४", "")
-        paune_graph = pynutil.insert("पौने ") + paune_numbers
-
-        graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ")
+        paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "")
+        paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers
+
+        graph_dedh_dhai = (
+            pynutil.insert("morphosyntactic_features: \"")
+            + dedh_dhai_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
-        graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ")
+        graph_savva = (
+            pynutil.insert("morphosyntactic_features: \"")
+            + savva_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
-        graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ")
+        graph_sadhe = (
+            pynutil.insert("morphosyntactic_features: \"")
+            + sadhe_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
-        graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ")
+        graph_paune = (
+            pynutil.insert("morphosyntactic_features: \"")
+            + paune_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
         final_graph = (
             self.optional_graph_negative
-            + pynini.closure(self.integer + pynini.accep(" "), 0, 1)
+            + pynini.closure(self.integer + pynini.accep(NEMO_SPACE), 0, 1)
             + self.numerator
             + self.denominator
         )
diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py
index 575b3d5d5..b7d74731e 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/measure.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py
@@ -15,9 +15,24 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space
+from nemo_text_processing.text_normalization.hi.graph_utils import (
+    HI_DEDH,
+    HI_DHAI,
+    HI_PAUNE,
+    HI_SADHE,
+    HI_SAVVA,
+    NEMO_SPACE,
+    GraphFst,
+    delete_space,
+    insert_space,
+)
 from nemo_text_processing.text_normalization.hi.utils import get_abs_path
 
+HI_POINT_FIVE = ".५"  # .5
+HI_ONE_POINT_FIVE = "१.५"  # 1.5
+HI_TWO_POINT_FIVE = "२.५"  # 2.5
+HI_DECIMAL_25 = ".२५"  # .25
+HI_DECIMAL_75 = ".७५"  # .75
+
 digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
 teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
@@ -54,7 +69,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
         decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
         decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
         unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
-        quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv"))
+
+        # Load quarterly units from separate files: map (FST) and list (FSA)
+        quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv"))
+        quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv"))
+        quarterly_units_graph = pynini.union(quarterly_units_map, quarterly_units_list)
 
         optional_graph_negative = pynini.closure(
             pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space,
@@ -65,16 +84,28 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
         # Define the quarterly measurements
         quarter = pynini.string_map(
             [
-                (".५", "साढ़े"),
-                ("१.५", "डेढ़"),
-                ("२.५", "ढाई"),
+                (HI_POINT_FIVE, HI_SADHE),
+                (HI_ONE_POINT_FIVE, HI_DEDH),
+                (HI_TWO_POINT_FIVE, HI_DHAI),
             ]
         )
         quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"")
 
         # Define the unit handling
-        unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ")
-        units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ")
+        unit = (
+            pynutil.insert(NEMO_SPACE)
+            + pynutil.insert("units: \"")
+            + unit_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
+        units = (
+            pynutil.insert(NEMO_SPACE)
+            + pynutil.insert("units: \"")
+            + quarterly_units_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
         # Handling symbols like x, X, *
         symbol_graph = pynini.string_map(
@@ -94,24 +125,43 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
             + unit
         )
 
-        dedh_dhai = pynini.string_map([("१.५", "डेढ़"), ("२.५", "ढाई")])
+        dedh_dhai = pynini.string_map([(HI_ONE_POINT_FIVE, HI_DEDH), (HI_TWO_POINT_FIVE, HI_DHAI)])
         dedh_dhai_graph = pynutil.insert("integer: \"") + dedh_dhai + pynutil.insert("\"")
 
-        savva_numbers = cardinal_graph + pynini.cross(".२५", "")
-        savva_graph = pynutil.insert("integer: \"सवा ") + savva_numbers + pynutil.insert("\"")
+        savva_numbers = cardinal_graph + pynini.cross(HI_DECIMAL_25, "")
+        savva_graph = (
+            pynutil.insert("integer: \"")
+            + pynutil.insert(HI_SAVVA)
+            + pynutil.insert(NEMO_SPACE)
+            + savva_numbers
+            + pynutil.insert("\"")
+        )
 
-        sadhe_numbers = cardinal_graph + pynini.cross(".५", "")
-        sadhe_graph = pynutil.insert("integer: \"साढ़े ") + sadhe_numbers + pynutil.insert("\"")
+        sadhe_numbers = cardinal_graph + pynini.cross(HI_POINT_FIVE, "")
+        sadhe_graph = (
+            pynutil.insert("integer: \"")
+            + pynutil.insert(HI_SADHE)
+            + pynutil.insert(NEMO_SPACE)
+            + sadhe_numbers
+            + pynutil.insert("\"")
+        )
 
         paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
-        paune_numbers = paune + pynini.cross(".७५", "")
-        paune_graph = pynutil.insert("integer: \"पौने ") + paune_numbers + pynutil.insert("\"")
+        paune_numbers = paune + pynini.cross(HI_DECIMAL_75, "")
+        paune_graph = (
+            pynutil.insert("integer: \"")
+            + pynutil.insert(HI_PAUNE)
+            + pynutil.insert(NEMO_SPACE)
+            + paune_numbers
+            + pynutil.insert("\"")
+        )
 
         graph_dedh_dhai = (
             pynutil.insert("cardinal { ")
             + optional_graph_negative
             + dedh_dhai_graph
-            + pynutil.insert(" }")
+            + pynutil.insert(NEMO_SPACE)
+            + pynutil.insert("}")
             + delete_space
             + units
         )
@@ -120,7 +170,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
             pynutil.insert("cardinal { ")
             + optional_graph_negative
             + savva_graph
-            + pynutil.insert(" }")
+            + pynutil.insert(NEMO_SPACE)
+            + pynutil.insert("}")
             + delete_space
             + units
         )
@@ -129,7 +180,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
             pynutil.insert("cardinal { ")
             + optional_graph_negative
            + sadhe_graph
-            + pynutil.insert(" }")
+            + pynutil.insert(NEMO_SPACE)
+            + pynutil.insert("}")
             + delete_space
             + units
         )
@@ -149,7 +201,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
             + pynutil.insert("integer: \"")
             + cardinal_graph
             + pynutil.insert("\"")
-            + pynutil.insert(" }")
+            + pynutil.insert(NEMO_SPACE)
+            + pynutil.insert("}")
             + delete_space
             + unit
         )
@@ -162,9 +215,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
             + cardinal_graph
             + pynutil.insert("\"")
             + pynutil.insert(" }")
-            + pynutil.insert(" units: \"")
+            + pynutil.insert(NEMO_SPACE)
+            + pynutil.insert("units: \"")
             + symbol_graph
-            + pynutil.insert("\" ")
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
             + pynutil.insert("} }")
             + insert_space
             + pynutil.insert("tokens { cardinal { ")
@@ -175,13 +230,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
         )
 
         graph = (
-            pynutil.add_weight(graph_decimal, 0.01)
-            | pynutil.add_weight(graph_cardinal, 0.01)
-            | pynutil.add_weight(graph_exceptions, 0.01)
-            | pynutil.add_weight(graph_dedh_dhai, 0.001)
-            | pynutil.add_weight(graph_savva, 0.005)
-            | pynutil.add_weight(graph_sadhe, 0.005)
-            | pynutil.add_weight(graph_paune, -0.2)
+            pynutil.add_weight(graph_decimal, 0.1)
+            | pynutil.add_weight(graph_cardinal, 0.1)
+            | pynutil.add_weight(graph_exceptions, 0.1)
+            | pynutil.add_weight(graph_dedh_dhai, -0.2)
+            | pynutil.add_weight(graph_savva, -0.1)
+            | pynutil.add_weight(graph_sadhe, -0.1)
+            | pynutil.add_weight(graph_paune, -0.5)
         )
 
         self.graph = graph.optimize()
diff --git a/nemo_text_processing/text_normalization/hi/taggers/ordinal.py b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py
index 51cbd666a..5f1cefed4 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/ordinal.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py
@@ -34,9 +34,14 @@ class OrdinalFst(GraphFst):
     def __init__(self, cardinal: CardinalFst, deterministic: bool = True):
         super().__init__(name="ordinal", kind="classify", deterministic=deterministic)
 
-        suffixes_fst = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv"))
+        suffixes_list = pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv"))
+        suffixes_map = pynini.string_file(get_abs_path("data/ordinal/suffixes_map.tsv"))
+        suffixes_fst = pynini.union(suffixes_list, suffixes_map)
+        exceptions = pynini.string_file(get_abs_path("data/ordinal/exceptions.tsv"))
 
         graph = cardinal.final_graph + suffixes_fst
+        exceptions = pynutil.add_weight(exceptions, -0.1)
+        graph = pynini.union(exceptions, graph)
         final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
 
         final_graph = self.add_tokens(final_graph)
diff --git a/nemo_text_processing/text_normalization/hi/taggers/punctuation.py b/nemo_text_processing/text_normalization/hi/taggers/punctuation.py
index 8309ba030..14c9a1a55 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/punctuation.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/punctuation.py
@@ -36,9 +36,9 @@ def __init__(self, deterministic: bool = True):
 
         emphasis = (
             pynini.accep("<")
-            + (
-                (pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1) + pynini.closure(pynini.accep("/"), 0, 1))
-                | (pynini.accep("/") + pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1))
+            + pynini.union(
+                (pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1) + pynini.closure(pynini.accep("/"), 0, 1)),
+                (pynini.accep("/") + pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)),
             )
             + pynini.accep(">")
         )
diff --git a/nemo_text_processing/text_normalization/hi/taggers/telephone.py b/nemo_text_processing/text_normalization/hi/taggers/telephone.py
index 039e30d74..d20870c0d 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/telephone.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/telephone.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,11 @@
 )
 from nemo_text_processing.text_normalization.hi.utils import get_abs_path
 
-delete_zero = pynutil.delete(pynini.union("0", "०"))
+HI_ZERO_DIGIT = pynini.union("0", "०")
+HI_MOBILE_START_DIGITS = pynini.union("६", "७", "८", "९", "6", "7", "8", "9").optimize()
+HI_LANDLINE_START_DIGITS = pynini.union("२", "३", "४", "६", "2", "3", "4", "6").optimize()
+
+delete_zero = pynutil.delete(HI_ZERO_DIGIT)
 delete_zero_optional = pynini.closure(delete_zero, 0, 1)
 insert_shunya = pynutil.insert('शून्य') + insert_space
@@ -41,16 +45,17 @@
 credit_context = pynini.string_file(get_abs_path("data/telephone/credit_context.tsv"))
 pincode_context = pynini.string_file(get_abs_path("data/telephone/pincode_context.tsv"))
 
+# Reusable optimized graph for any digit token
+num_token = pynini.union(digit_to_word, digits, zero).optimize()
 
-def generate_mobile(context_keywords):
-    context_before, context_after = get_context(context_keywords)
-    allowed_digits = pynini.union("६", "७", "८", "९", "6", "7", "8", "9")
+
+def generate_mobile(context_keywords: pynini.Fst) -> pynini.Fst:
+    context_before, context_after = get_context(context_keywords)
 
     # Filter cardinals to only include allowed digits
-    mobile_start_digit = allowed_digits @ digits | allowed_digits @ digit_to_word
+    mobile_start_digit = pynini.union(HI_MOBILE_START_DIGITS @ digits, HI_MOBILE_START_DIGITS @ digit_to_word)
 
-    country_code_digits = pynini.closure((digit_to_word | digits | zero) + insert_space, 1, 3)
+    country_code_digits = pynini.closure(num_token + insert_space, 1, 3)
 
     country_code = (
         pynutil.insert("country_code: \"")
         + context_before
@@ -63,7 +68,7 @@ def generate_mobile(context_keywords):
 
     extension_optional = pynini.closure(
         pynutil.insert("extension: \"")
-        + pynini.closure((digit_to_word | digits | zero) + insert_space, 1, 3)
+        + pynini.closure(num_token + insert_space, 1, 3)
         + context_after
         + pynutil.insert("\" ")
         + delete_space,
         0,
         1,
     )
 
-    number_part = mobile_start_digit + insert_space + pynini.closure((digit_to_word | digits | zero) + insert_space, 9)
+    number_part = mobile_start_digit + insert_space + pynini.closure(num_token + insert_space, 9)
 
     number_without_country = (
         pynutil.insert("number_part: \"")
@@ -93,31 +98,27 @@ def generate_mobile(context_keywords):
         + delete_space
     )
 
-    return (number_with_country | number_without_country) + extension_optional
+    return (pynini.union(number_with_country, number_without_country) + extension_optional).optimize()
 
 
-def get_landline(std_length, context_keywords):
+def get_landline(std_length: int, context_keywords: pynini.Fst) -> pynini.Fst:
     context_before, context_after = get_context(context_keywords)
 
-    allowed_digits = pynini.union("२", "३", "४", "६", "2", "3", "4", "6")
-
     # Filter cardinals to only include allowed digits
-    landline_start_digit = allowed_digits @ digits | allowed_digits @ digit_to_word
+    landline_start_digit = pynini.union(HI_LANDLINE_START_DIGITS @ digits, HI_LANDLINE_START_DIGITS @ digit_to_word)
 
     std_code_graph = (
-        delete_zero_optional
-        + insert_shunya
-        + pynini.closure((digit_to_word | digits | zero) + insert_space, std_length, std_length)
+        delete_zero_optional + insert_shunya + pynini.closure(num_token + insert_space, std_length, std_length)
     )
 
     landline_digit_count = 9 - std_length
 
     landline_graph = (
         landline_start_digit
         + insert_space
-        + pynini.closure((digit_to_word | digits | zero) + insert_space, landline_digit_count, landline_digit_count)
+        + pynini.closure(num_token + insert_space, landline_digit_count, landline_digit_count)
     )
 
-    separator_optional = pynini.closure(pynini.cross("-", "") | pynini.cross(".", ""), 0, 1)
+    separator_optional = pynini.closure(pynini.union(pynini.cross("-", ""), pynini.cross(".", "")), 0, 1)
 
     std_code_in_brackets = (
         delete_zero_optional
@@ -140,10 +141,10 @@ def get_landline(std_length, context_keywords):
         + landline_graph
         + context_after
         + pynutil.insert("\" ")
-    )
+    ).optimize()
 
 
-def generate_landline(context_keywords):
+def generate_landline(context_keywords: pynini.Fst) -> pynini.Fst:
     graph = (
         get_landline(2, context_keywords)
         | get_landline(3, context_keywords)
@@ -153,10 +154,10 @@ def generate_landline(context_keywords):
         | get_landline(7, context_keywords)
     )
 
-    return graph
+    return graph.optimize()
 
 
-def get_context(keywords: list):
+def get_context(keywords: pynini.Fst):
 
     all_digits = pynini.union(NEMO_HI_DIGIT, NEMO_DIGIT)
 
@@ -172,28 +173,28 @@ def get_context(keywords: list):
     return before.optimize(), after.optimize()
 
 
-def generate_credit(context_keywords):
+def generate_credit(context_keywords: pynini.Fst) -> pynini.Fst:
     context_before, context_after = get_context(context_keywords)
 
     return (
         pynutil.insert("number_part: \"")
         + context_before
-        + pynini.closure((digit_to_word | digits | zero) + insert_space, 4)
+        + pynini.closure(num_token + insert_space, 4)
         + context_after
        + pynutil.insert("\" ")
         + delete_space
-    )
+    ).optimize()
 
 
-def generate_pincode(context_keywords):
+def generate_pincode(context_keywords: pynini.Fst) -> pynini.Fst:
     context_before, context_after = get_context(context_keywords)
 
     return (
         pynutil.insert("number_part: \"")
         + context_before
-        + pynini.closure((digit_to_word | digits | zero) + insert_space, 6)
+        + pynini.closure(num_token + insert_space, 6)
         + context_after
         + pynutil.insert("\" ")
         + delete_space
-    )
+    ).optimize()
 
 
 class TelephoneFst(GraphFst):
diff --git a/nemo_text_processing/text_normalization/hi/taggers/time.py b/nemo_text_processing/text_normalization/hi/taggers/time.py
index e78b31380..09defaab2 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/time.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/time.py
@@ -15,9 +15,24 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space
+from nemo_text_processing.text_normalization.hi.graph_utils import (
+    HI_DEDH,
+    HI_DHAI,
+    HI_PAUNE,
+    HI_SADHE,
+    HI_SAVVA,
+    NEMO_SPACE,
+    GraphFst,
+    insert_space,
+)
 from nemo_text_processing.text_normalization.hi.utils import get_abs_path
 
+# Time patterns specific to time tagger
+HI_DOUBLE_ZERO = "००"
+HI_TIME_FIFTEEN = ":१५"  # :15
+HI_TIME_THIRTY = ":३०"  # :30
+HI_TIME_FORTYFIVE = ":४५"  # :45
+
 hours_graph = pynini.string_file(get_abs_path("data/time/hours.tsv"))
 minutes_graph = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
 seconds_graph = pynini.string_file(get_abs_path("data/time/seconds.tsv"))
@@ -55,36 +70,56 @@ def __init__(self, cardinal: GraphFst):
         graph_hm = self.hours + delete_colon + insert_space + self.minutes
 
         # hour
-        graph_h = self.hours + delete_colon + pynutil.delete("००")
+        graph_h = self.hours + delete_colon + pynutil.delete(HI_DOUBLE_ZERO)
 
-        dedh_dhai_graph = pynini.string_map([("१:३०", "डेढ़"), ("२:३०", "ढाई")])
+        dedh_dhai_graph = pynini.string_map([("१" + HI_TIME_THIRTY, HI_DEDH), ("२" + HI_TIME_THIRTY, HI_DHAI)])
 
-        savva_numbers = cardinal_graph + pynini.cross(":१५", "")
-        savva_graph = pynutil.insert("सवा ") + savva_numbers
+        savva_numbers = cardinal_graph + pynini.cross(HI_TIME_FIFTEEN, "")
+        savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers
 
-        sadhe_numbers = cardinal_graph + pynini.cross(":३०", "")
-        sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers
+        sadhe_numbers = cardinal_graph + pynini.cross(HI_TIME_THIRTY, "")
+        sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers
 
         paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
-        paune_numbers = paune + pynini.cross(":४५", "")
-        paune_graph = pynutil.insert("पौने ") + paune_numbers
-
-        graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ")
+        paune_numbers = paune + pynini.cross(HI_TIME_FORTYFIVE, "")
+        paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers
+
+        graph_dedh_dhai = (
+            pynutil.insert("morphosyntactic_features: \"")
+            + dedh_dhai_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
-        graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ")
+        graph_savva = (
+            pynutil.insert("morphosyntactic_features: \"")
+            + savva_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
-        graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ")
+        graph_sadhe = (
+            pynutil.insert("morphosyntactic_features: \"")
+            + sadhe_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
-        graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ")
+        graph_paune = (
+            pynutil.insert("morphosyntactic_features: \"")
+            + paune_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(NEMO_SPACE)
+        )
 
         final_graph = (
             graph_hms
-            | pynutil.add_weight(graph_hm, 0.01)
-            | pynutil.add_weight(graph_h, 0.01)
-            | pynutil.add_weight(graph_dedh_dhai, 0.001)
-            | pynutil.add_weight(graph_savva, 0.005)
-            | pynutil.add_weight(graph_sadhe, 0.005)
-            | pynutil.add_weight(graph_paune, 0.001)
+            | pynutil.add_weight(graph_hm, 0.3)
+            | pynutil.add_weight(graph_h, 0.3)
+            | pynutil.add_weight(graph_dedh_dhai, 0.1)
+            | pynutil.add_weight(graph_savva, 0.2)
+            | pynutil.add_weight(graph_sadhe, 0.2)
+            | pynutil.add_weight(graph_paune, 0.1)
         )
 
         final_graph = self.add_tokens(final_graph)
diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py
index ceaf74689..e3e6fc5d8 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py
@@ -14,7 +14,6 @@
 
 import logging
 import os
-import time
 
 import pynini
 from pynini.lib import pynutil
@@ -80,61 +79,39 @@ def __init__(
         else:
             logging.info(f"Creating ClassifyFst grammars.")
 
-            start_time = time.time()
             cardinal = CardinalFst(deterministic=deterministic)
             cardinal_graph = cardinal.fst
-            logging.debug(f"cardinal: {time.time() - start_time: .2f}s -- {cardinal_graph.num_states()} nodes")
 
-            start_time = time.time()
             decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
             decimal_graph = decimal.fst
-            logging.debug(f"decimal: {time.time() - start_time: .2f}s -- {decimal_graph.num_states()} nodes")
 
-            start_time = time.time()
             fraction = FractionFst(cardinal=cardinal, deterministic=deterministic)
             fraction_graph = fraction.fst
-            logging.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes")
 
-            start_time = time.time()
             date = DateFst(cardinal=cardinal)
             date_graph = date.fst
-            logging.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes")
 
-            start_time = time.time()
             timefst = TimeFst(cardinal=cardinal)
             time_graph = timefst.fst
-            logging.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes")
 
-            start_time = time.time()
             measure = MeasureFst(cardinal=cardinal, decimal=decimal)
             measure_graph = measure.fst
-            logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes")
 
-            start_time = time.time()
             money = MoneyFst(cardinal=cardinal)
             money_graph = money.fst
-            logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes")
 
-            start_time = time.time()
             ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
             ordinal_graph = ordinal.fst
-            logging.debug(f"ordinal: {time.time() - start_time: .2f}s -- {ordinal_graph.num_states()} nodes")
 
-            start_time = time.time()
             whitelist_graph = WhiteListFst(
                 input_case=input_case, deterministic=deterministic, input_file=whitelist
             ).fst
-            logging.debug(f"whitelist: {time.time() - start_time: .2f}s -- {whitelist_graph.num_states()} nodes")
 
-            start_time = time.time()
             punctuation = PunctuationFst(deterministic=deterministic)
             punct_graph = punctuation.fst
-            logging.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes")
 
-            start_time = time.time()
             telephone = TelephoneFst()
             telephone_graph = telephone.fst
-            logging.debug(f"telephone: {time.time() - start_time: .2f}s -- {telephone_graph.num_states()} nodes")
 
             classify = (
                 pynutil.add_weight(whitelist_graph, 1.01)
@@ -149,18 +126,18 @@ def __init__(
                 | pynutil.add_weight(ordinal_graph, 1.1)
             )
 
-            start_time = time.time()
             word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst
-            logging.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes")
 
             punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
             punct = pynini.closure(
-                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
-                | (pynutil.insert(NEMO_SPACE) + punct),
+                pynini.union(
+                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space),
+                    (pynutil.insert(NEMO_SPACE) + punct),
+                ),
                 1,
             )
 
-            classify |= pynutil.add_weight(word_graph, 100)
+            classify = pynini.union(classify, pynutil.add_weight(word_graph, 100))
             token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
             token_plus_punct = (
                 pynini.closure(punct + pynutil.insert(NEMO_SPACE))
@@ -169,15 +146,15 @@ def __init__(
             )
 
             graph = token_plus_punct + pynini.closure(
-                (
-                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
-                    | (pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE))
+                pynini.union(
+                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space),
+                    (pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE)),
                 )
                 + token_plus_punct
             )
 
             graph = delete_space + graph + delete_space
-            graph |= punct
+            graph = pynini.union(graph, punct)
 
             self.fst = graph.optimize()
diff --git a/nemo_text_processing/text_normalization/hi/taggers/word.py b/nemo_text_processing/text_normalization/hi/taggers/word.py
index 151a72e99..00feb1827 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/word.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/word.py
@@ -40,9 +40,9 @@ def __init__(self, punctuation: PunctuationFst, deterministic: bool = True):
 
         # Define Hindi characters and symbols using pynini.union
         HINDI_CHAR = pynini.union(
-            *[chr(i) for i in range(ord("ऀ"), ord("ः") + 1)],  # Hindi vowels and consonants
-            *[chr(i) for i in range(ord("अ"), ord("ह") + 1)],  # More Hindi characters
-            *[chr(i) for i in range(ord("ा"), ord("्") + 1)],  # Hindi diacritics
+            *[chr(i) for i in range(0x0900, 0x0903 + 1)],  # Hindi vowels and consonants
+            *[chr(i) for i in range(0x0905, 0x0939 + 1)],  # More Hindi characters
+            *[chr(i) for i in range(0x093E, 0x094D + 1)],  # Hindi diacritics
         ).optimize()
 
         # Include punctuation in the graph
diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py
index a6a677ec3..55ebeab01 100644
--- a/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py
+++ b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt
index d1a072d0c..9bdcab2a4 100644
--- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt
+++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt
@@ -1,5 +1,15 @@
+१ला~पहला
+१ली~पहली
+२रा~दूसरा
+२री~दूसरी
+३रा~तीसरा
+३री~तीसरी
+४था~चौथा
+४थी~चौथी
 ५वां~पाँचवां
 ५वीं~पाँचवीं
+६ठा~छठा
+६ठी~छठी
 ७वां~सातवां
 ७वीं~सातवीं
 ८वां~आठवां