summary | refs | log | tree | commit | diff
path: root/regparse.c
diff options
author: Mari Imaizumi <mariimaizumi5@gmail.com> 2025-02-24 12:09:26 +0900
committer: Mari Imaizumi <mariimaizumi5@gmail.com> 2025-03-18 21:18:12 +0900
commit: 6670926a91734ddb92d01ce4578b1bb48d26b7de (patch)
tree: 5a7fa801f4fa5f236c772589ad386cc16f01b7e3 /regparse.c
parent: e63c516046b6dbf2f684454b68013b4eea12e94a (diff)
Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker.
https://www.unicode.org/reports/tr29/tr29-43.html#GB9c
Notes
Notes: Merged: https://github.com/ruby/ruby/pull/12798
Diffstat (limited to 'regparse.c')
-rw-r--r-- regparse.c 72
1 files changed, 57 insertions, 15 deletions
diff --git a/regparse.c b/regparse.c
index 9653a9e1ba..c7587b53b7 100644
--- a/regparse.c
+++ b/regparse.c
@@ -5950,19 +5950,21 @@ create_node_from_array(int kind, Node **np, Node **node_array)
* nodes of the source to NULL_NODE, we can overlap the target array
* as long as we do not override the actual target location.
*
- * Target Array name Index
+ * Target Array name Index
*
- * node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F
- * top_alts alts[5] 0 1 2 3 4*
- * alts+1 list[4] 0 1 2 3*
- * list+1 core_alts[7] 0 1 2 3 4 5 6*
- * core_alts+0 H_list[4] 0 1 2 3*
- * H_list+1 H_alt2[4] 0 1 2 3*
- * h_alt2+1 H_list2[3] 0 1 2*
- * core_alts+4 XP_list[4] 0 1 2 3*
- * XP_list+1 Ex_list[4] 0 1 2 3*
+ * node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F G H
+ * top_alts alts[5] 0 1 2 3 4*
+ * alts+2 list[4] 0 1 2 3*
+ * list+1 core_alts[8] 0 1 2 3 4 5 6 7*
+ * core_alts+0 H_list[4] 0 1 2 3*
+ * H_list+1 H_alt2[4] 0 1 2 3*
+ * H_alt2+1 H_list2[3] 0 1 2*
+ * core_alts+4 XP_list[3] 0 1 2*
+ * XP_list+1 Ex_list[4] 0 1 2 3*
+ * core_alts+5 CC_list[3] 0 1 2*
+ * CC_list+1 CC_inner_list[5] 0 1 2 3 4*
*/
-#define NODE_COMMON_SIZE 16
+#define NODE_COMMON_SIZE 18
static int
node_extended_grapheme_cluster(Node** np, ScanEnv* env)
@@ -6029,9 +6031,10 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
/* core := hangul-syllable
* | ri-sequence
* | xpicto-sequence
+ * | conjunctCluster
* | [^Control CR LF] */
{
- Node **core_alts = list + 2; /* size: 7 */
+ Node **core_alts = list + 2; /* size: 8 */
/* hangul-syllable :=
* L* (V+ | LV V* | LVT) T*
@@ -6099,10 +6102,49 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
R_ERR(create_node_from_array(LIST, core_alts+4, XP_list));
}
+ /* conjunctCluster := \p{InCB=Consonant} ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})+ */
+ {
+ // \p{InCB=Consonant}
+ Node **CC_list = core_alts + 6; /* size: 3 */
+ R_ERR(create_property_node(CC_list+0, env, "InCB=Consonant"));
+
+ {
+ Node **CC_inner_list = CC_list + 2; /* size: 5 */
+ {
+ // [\p{InCB=Extend} \p{InCB=Linker}]*
+ R_ERR(create_property_node(CC_inner_list+0, env, "InCB=Extend"));
+ R_ERR(add_property_to_cc(NCCLASS(CC_inner_list[0]), "InCB=Linker", 0, env));
+ R_ERR(quantify_node(CC_inner_list+0, 0, REPEAT_INFINITE));
+ }
+
+ // \p{InCB=Linker}
+ R_ERR(create_property_node(CC_inner_list+1, env, "InCB=Linker"));
+
+ {
+ // [\p{InCB=Extend} \p{InCB=Linker}]*
+ R_ERR(create_property_node(CC_inner_list+2, env, "InCB=Extend"));
+ R_ERR(add_property_to_cc(NCCLASS(CC_inner_list[2]), "InCB=Linker", 0, env));
+ R_ERR(quantify_node(CC_inner_list+2, 0, REPEAT_INFINITE));
+ }
+
+ // \p{InCB=Consonant}
+ R_ERR(create_property_node(CC_inner_list+3, env, "InCB=Consonant"));
+
+ // ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})
+ R_ERR(create_node_from_array(LIST, CC_list+1, CC_inner_list));
+
+ // (...)+
+ R_ERR(quantify_node(CC_list+1, 1, REPEAT_INFINITE));
+ }
+
+ // \p{InCB=Consonant} ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})+
+ R_ERR(create_node_from_array(LIST, core_alts+5, CC_list));
+ }
+
/* [^Control CR LF] */
- core_alts[5] = node_new_cclass();
- if (IS_NULL(core_alts[5])) goto err;
- cc = NCCLASS(core_alts[5]);
+ core_alts[6] = node_new_cclass();
+ if (IS_NULL(core_alts[6])) goto err;
+ cc = NCCLASS(core_alts[6]);
if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
BBuf *inverted_buf = NULL;
close