#!/usr/bin/env python
# Copyright (c) 2026 Flipdare Pty Ltd. All rights reserved.
#
# This file is part of Flipdare's proprietary software and contains
# confidential and copyrighted material. Unauthorised copying,
# modification, distribution, or use of this file is strictly
# prohibited without prior written permission from Flipdare Pty Ltd.
#
# This software includes third-party components licensed under MIT,
# BSD, and Apache 2.0 licences. See THIRD_PARTY_NOTICES for details.
#
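"""spaCy-backed tokenizer that turns free text into scored search tokens.

Plain tokens are treated as MEDIUM-confidence matches and named entities as
HIGH; inputs that yield neither fall back to the raw string at LOW.
"""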

from dataclasses import dataclass

import spacy

from flipdare.core.singleton import Singleton
from flipdare.generated.shared.model.token_score import TokenScore

__all__ = ["Tokenizer", "TokenizerResult"]


@dataclass
class TokenizerResult:
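    """Flattened result: ordered unique tokens plus the highest score seen."""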

    tokens: list[str]
    token_score: TokenScore


@dataclass
class Tokens:
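    """Tokens for one input string, bucketed by match confidence."""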

    low: list[str]
    med: list[str]
    high: list[str]

    @property
    def has_tokens(self) -> bool:
        return len(self.low) > 0 or len(self.med) > 0 or len(self.high) > 0

    @property
    def low_only(self) -> bool:
        return len(self.med) == 0 and len(self.high) == 0

    @property
    def has_low(self) -> bool:
        return len(self.low) > 0

    @property
    def has_med(self) -> bool:
        return len(self.med) > 0

    @property
    def has_high(self) -> bool:
        return len(self.high) > 0


class Tokenizer(Singleton):
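    """Singleton wrapper around a spaCy pipeline that produces scored tokens.

    Illustrative usage (actual tokens depend on the loaded model)::

        result = Tokenizer().create_tokens("Alice visited Paris")
        # e.g. tokens ["visited", "Alice", "Paris"], token_score HIGH
    """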

    def __init__(self, nlp: spacy.language.Language | None = None) -> None:
        # Callers may inject a pipeline; otherwise load the small English model.
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
        self._nlp = nlp

    @property
    def nlp(self) -> spacy.language.Language:
        return self._nlp

    def is_person_name(self, text: str) -> bool:
        doc = self._nlp(text)
        # Check if any entity found is a PERSON
        return any(ent.label_ == "PERSON" for ent in doc.ents)

    def create_tokens(self, value: str | list[str]) -> TokenizerResult:
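        """Tokenize a string or list of strings into ordered unique tokens.

        The result's score is the highest confidence seen across all inputs;
        if nothing tokenizes, the joined input is returned at LOW.
        """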

        entries: list[str] = []
        fallback: str

        if isinstance(value, str):
            fallback = value
            entries.append(value)
        else:
            fallback = " ".join(value)
            entries.extend(value)

        tokens: list[str] = []
        max_score = TokenScore.LOW
        for entry in entries:
            tag_results = self._get_tokens(entry)
            for tag_result in tag_results:
                if tag_result.has_low:
                    tokens.extend(tag_result.low)

                if tag_result.has_med:
                    tokens.extend(tag_result.med)
                    if max_score != TokenScore.HIGH:
                        max_score = TokenScore.MEDIUM

                if tag_result.has_high:
                    tokens.extend(tag_result.high)
                    max_score = TokenScore.HIGH

        # remove duplicates while preserving order
        tokens = list(dict.fromkeys(tokens))
        if len(tokens) == 0:
            return TokenizerResult(tokens=[fallback], token_score=TokenScore.LOW)

        return TokenizerResult(tokens=tokens, token_score=max_score)

    def _get_tokens(self, value: str | list[str]) -> list[Tokens]:
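        """Return one Tokens bucket per input string: plain tokens score
        medium, named entities score high, and inputs that yield neither
        fall back to the raw string at low.
        """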

        values: list[str] = []
        if isinstance(value, str):
            values.append(value)
        else:
            values.extend(value)

        result: list[Tokens] = []
        for val in values:
            tokens, named_entities = self._generate(val)

            if len(tokens) == 0 and len(named_entities) == 0:
                # LOG().warning(f'No tokens found for "{val}"')
                result.append(Tokens([val], [], []))
                continue

            # LOG().debug(f"Found {len(tokens)} tokens and "
            #             f"{len(named_entities)} named entities for value '{val}'")

            result.append(Tokens([], tokens, named_entities))

        return result

    def _generate(self, text: str) -> tuple[list[str], list[str]]:
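        """Split text into (plain tokens, named entities), dropping stop
        words and punctuation; tokens that belong to an entity are kept out
        of the plain-token list.
        """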

        # Process the original text to get named entities with full context
        doc = self._nlp(text)

        # Extract named entities first (with original context)
        named_entities: list[str] = []
        entity_token_texts: set[str] = set()

        for ent in doc.ents:
            # Get the entity text without stop words/punctuation
            entity_tokens = [
                token.text for token in ent if not token.is_stop and not token.is_punct
            ]
            if entity_tokens:
                named_text = " ".join(entity_tokens)
                named_entities.append(named_text)
                # Track individual tokens that are part of entities
                entity_token_texts.update(entity_tokens)

        # Extract regular tokens (non-stop, non-punct, not part of named entities)
        # This preserves the original left-to-right order from the source text
        found_tokens: list[str] = []
        for token in doc:
            if token.is_stop or token.is_punct:
                continue
            token_text = token.text.strip()
            if token_text not in entity_token_texts:
                found_tokens.append(token_text)

        # LOG().debug(f'Named Entities: {named_entities} Tokens: {found_tokens}')
        return found_tokens, named_entities
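

if __name__ == "__main__":
    # Illustrative smoke check, not part of the shipped module: it assumes
    # the en_core_web_sm model is installed and that TokenScore exposes the
    # LOW/MEDIUM/HIGH members used above. Actual tokens depend on the model.
    demo = Tokenizer().create_tokens(["Alice moved to Paris", "hello world"])
    print(demo.tokens, demo.token_score)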