Skip to content

edsnlp.pipes.ner.tnm.patterns_new

# Regex fragment for the tumour (T) component of a TNM expression.
# NOTE(review): the capture-group names and the escaped parentheses around the
# suffix are reconstructed (the reviewed text had bare `(?P` groups, which the
# regex engine rejects) - confirm the names against the code reading the groups.
tumour_pattern = (
    r"(?P<tumour_prefix>[cpyramP]{1,2}\s?)?"  # Optional tumour prefix
    r"T\s?"  # 'T' followed by optional space
    r"(?P<tumour>([0-4]|is|[Xx]|[Oo]))"  # Tumour size (required if 'T' is present)
    r"(?:\s?(?P<tumour_specification>[abcdxm]|mi))?"  # Optional tumour specification
    # Suffix is written between literal parentheses, hence the `[^()]` class.
    r"(?:\s?\((?P<tumour_suffix>[^()]{1,10})\))?"  # Optional tumour suffix
)

# Regex fragment for the lymph-node (N) component of a TNM expression.
# NOTE(review): the capture-group names, the escaped parentheses and the `*`
# quantifiers in the "(n / m)" ratio are reconstructed (the reviewed text had
# bare `(?P` groups and unescaped duplicates such as `mi|(mi)`) - confirm
# against the code reading the groups.
node_pattern = (
    r"(?P<node_prefix>[cpyraP]{1,2}\s?)?"  # Optional node prefix
    r"N\s?"  # 'N' followed by optional space
    r"(?P<node>[Xx01234+]|[Oo])"  # Node size/status (required if 'N' is present)
    r"(?:\s?(?P<node_specification>"
    r"[abcdxm]|mi|sn|i[-,+]|mol[-,+]|\(mi\)|\(sn\)|"
    r"\(i[-,+]\)|\(mol[-,+]\)|\(\d+\s*/\s*\d+\)))?"  # Optional specification
    # Suffix is written between literal parentheses, hence the `[^()]` class.
    r"(?:\s?\((?P<node_suffix>[^()]{1,10})\))?"  # Optional suffix
)

# Regex fragment for the metastasis (M) component of a TNM expression.
# NOTE(review): the capture-group names and the escapes are reconstructed (the
# reviewed text had bare `(?P` groups) - confirm against the code reading the
# groups. `+` and the parentheses are escaped so that the literal markers
# "i+", "(mol+)", "(cy+)" match; unescaped, `+` acted as a quantifier and the
# parenthesised alternatives merely duplicated the bare ones.
metastasis_pattern = (
    r"(?P<metastasis_prefix>[cpyraP]{1,2}\s?)?"  # Optional metastasis prefix
    r"M\s?"  # 'M' followed by optional space
    r"(?P<metastasis>[Xx0123+]|[Oo])"  # Metastasis status (required if 'M' is present)
    r"(?:\s?(?P<metastasis_specification>"
    r"[abcdm]|i\+|mol\+|cy\+|\(i\+\)|\(mol\+\)|"
    r"\(cy\+\)|PUL|OSS|HEP|BRA|LYM|OTH|MAR|PLE|PER|ADR|SKI))?"  # Optional specification
)

# Regex fragment for the pleural-invasion (PL) component.
# NOTE(review): the capture-group name is reconstructed (the reviewed text had
# a bare `(?P(` group) - confirm against the code reading the group.
pleura_pattern = (
    r"PL\s?(?P<pleura>([0123]|x))?"  # Optional pleura status (for lung cancer)
)

# Regex fragment for the resection (R) component of a TNM expression.
# NOTE(review): the capture-group names, the escapes and the `*` quantifiers of
# the location part are reconstructed (the reviewed text had bare `(?P` groups
# and a redundantly nested, unquantified location group) - confirm against the
# code reading the groups, in particular the two location group names.
resection_pattern = (
    r"R\s?"  # 'R' followed by optional space
    r"(?P<resection>[Xx012+])"  # Resection completeness
    # `+` and parentheses are literal here: "cy+", "(is)", "(cy+)".
    r"(?:\s?(?P<resection_specification>is|cy\+|\(is\)|\(cy\+\)))?"  # Optional spec
    # Zero or more parenthesised lowercase words separated by `,`, `;` or
    # whitespace. The outer group may match the empty string.
    r"(?:\s?(?P<resection_loc>"
    r"(\((?P<resection_loc_name>[a-z]+)\)[,;\s]*)*))?"  # Optional loc
)

# Regex fragment for the classification version, optionally wrapped in
# parentheses, e.g. "(UICC 2017)" or "TNM ed. 2009".
# NOTE(review): the capture-group names and the optional literal parentheses
# are reconstructed (the reviewed text started with the invalid `(?(?P` and
# ended with an unbalanced `))?`) - confirm against the code reading the groups.
version_pattern = (
    r"\(?(?P<version>uicc|accj|tnm|UICC|ACCJ|TNM)"  # TNM version
    # The dot is escaped so that only a literal "." may follow "ed".
    r"\s+([éeE]ditions|[éeE]d\.?)?\s*"
    r"(?P<version_year>\d{4}|\d{2})\)?"  # Year of the version
)

# Delimiter between two TNM components: optional whitespace around an optional
# comma or slash, so that "T1 N0", "T1, N0", "T1/N0" and "T1 / N0" are all
# accepted. It can match the empty string.
# NOTE(review): the reviewed text read `\s[,\/]?\s| `, which demands whitespace
# on BOTH sides of the comma/slash; the `*` quantifiers are restored on the
# assumption that they were lost - confirm.
TNM_space = r"(\s*[,\/]?\s*)"  # Allow space, comma, or slash as delimiters

# We need to exclude patterns like 'T1', 'T2' if they are not followed by node or
# metastasis sections.

# Negative lookahead placed in front of the main pattern: it rejects a lone
# tumour mention (e.g. 'T1', 'T2') that is followed by a delimiter or the end
# of the string but not by the node / metastasis fragments.
# NOTE(review): the `*` after `\s` and the escaped parentheses of the suffix
# are restored; the reviewed text read `T\s`, which requires whitespace between
# 'T' and the stage and therefore could never match 'T1' - confirm.
# NOTE(review): the "?" appended to node_pattern / metastasis_pattern binds only
# to the last group of each fragment, not to the whole fragment - confirm that
# this is the intent.
exclude_pattern = (
    r"(?!"
    r"(?:[cpyramP]{0,2}\s*)?"  # Optional prefix like p, yp, PT
    r"T\s*"
    r"(?:[0-4]|is|[xXoO])"  # T stage (includes is, x, o)
    r"(?:[abcdx]|mi)?"  # Optional specification
    r"(?:\s*\([^()]{1,10}\))?"  # Optional suffix
    r"(?:\s*[\s,\/.()]|$)"  # Delimiter, or end-of-string ($)
    # Inner negative lookahead: only reject when no N/M section follows.
    r"(?!\s*"
    + node_pattern
    + "?"
    + TNM_space
    + "?"
    + metastasis_pattern
    + "?"
    + ")"
    + ")"
)

# Full TNM pattern: a start anchor, the exclusion lookahead, then the tumour
# fragment (mandatory) followed by the optional node, metastasis, pleura,
# resection and version fragments, each separated by an optional TNM_space,
# and finally a lookahead for a delimiter or the end of the string.
# NOTE(review): the names of the six wrapper groups are reconstructed (the
# reviewed text had bare `(?P` groups) - confirm against the code reading them.
# NOTE(review): the leading anchor read `(?:|^)`, whose empty alternative always
# matches; `\b` is restored on the assumption that it was lost - confirm.
# NOTE(review): node_pattern and metastasis_pattern are also embedded in
# exclude_pattern; if they carry named groups, those names appear twice in the
# final pattern, which the standard `re` module rejects - presumably this is
# compiled with an engine that tolerates duplicate names; confirm.
tnm_pattern_new = (
    r"(?:\b|^)"
    + exclude_pattern
    + r"(?:"
    + r"(?P<T_component>"
    + tumour_pattern
    + ")"
    + TNM_space
    + "?"
    + r"(?P<N_component>"
    + node_pattern
    + ")?"
    + TNM_space
    + "?"
    + r"(?P<M_component>"
    + metastasis_pattern
    + ")?"
    + TNM_space
    + "?"
    + r"(?P<PL_component>"
    + pleura_pattern
    + ")?"
    + TNM_space
    + "?"
    + r"(?P<R_component>"
    + resection_pattern
    + ")?"
    + TNM_space
    + "?"
    + r"(?P<V_component>"
    + version_pattern
    + ")?"
    + r")"
    + r"(?=[\s().,;:/]|$)"  # Must be followed by a delimiter or end-of-string
)