1use anyhow::Result;
6use regex::Regex;
7
/// One sensitive value found in OCR output, together with the rectangle
/// that covers every OCR region the value was read from.
///
/// `w` and `h` are extents measured from `x` and `y`, so the covered area
/// runs from `(x, y)` to `(x + w, y + h)`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PiiMatch {
    /// Smallest `x` among the OCR regions the match touches.
    pub x: u32,
    /// Smallest `y` among the OCR regions the match touches.
    pub y: u32,
    /// Horizontal extent of the union of the touched regions.
    pub w: u32,
    /// Vertical extent of the union of the touched regions.
    pub h: u32,
    /// Category label of the pattern that matched, e.g. `"email"` or `"ssn"`.
    pub pii_type: String,
    /// The matched text, with surrounding whitespace trimmed.
    pub text: String,
}
16
17pub fn detect_pii(ocr_regions: &[super::ocr::OcrRegion]) -> Result<Vec<PiiMatch>> {
23 if ocr_regions.is_empty() {
24 return Ok(vec![]);
25 }
26
27 struct WordSpan {
30 start: usize,
31 end: usize,
32 region_idx: usize,
33 }
34
35 let mut full_text = String::new();
36 let mut word_spans: Vec<WordSpan> = Vec::new();
37
38 for (i, region) in ocr_regions.iter().enumerate() {
39 let start = full_text.len();
40 full_text.push_str(®ion.text);
41 let end = full_text.len();
42 word_spans.push(WordSpan {
43 start,
44 end,
45 region_idx: i,
46 });
47 full_text.push(' ');
48 }
49
50 let patterns: &[(&str, &str)] = &[
52 ("email", r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}"),
53 (
54 "phone",
55 r"(?:\+1[\s\-]?)?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{4}",
56 ),
57 ("ssn", r"\b\d{3}-\d{2}-\d{4}\b"),
58 ("credit_card", r"\b(?:\d{4}[\s\-]?){3}\d{4}\b"),
59 (
60 "ip_v4",
61 r"\b(?:25[0-5]|2\d{2}|1\d{2}|[1-9]\d|\d)(?:\.(?:25[0-5]|2\d{2}|1\d{2}|[1-9]\d|\d)){3}\b",
62 ),
63 ("ip_v6", r"\b(?:[0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{0,4}\b"),
64 (
65 "api_key",
66 r"\b(?:sk|pk)[-_][a-zA-Z0-9]{16,}|ghp_[a-zA-Z0-9]{36}|AKIA[A-Z0-9]{16}\b",
67 ),
68 ("url", r"https?://[^\s]+"),
69 ];
70
71 let mut matches = Vec::new();
72
73 for &(pii_type, pattern) in patterns {
74 let re = Regex::new(pattern)?;
75 for mat in re.find_iter(&full_text) {
76 let match_start = mat.start();
77 let match_end = mat.end();
78 let matched_text = mat.as_str().trim().to_string();
79
80 if pii_type == "credit_card" {
82 let digits: String = matched_text
83 .chars()
84 .filter(|c| c.is_ascii_digit())
85 .collect();
86 if !luhn_valid(&digits) {
87 continue;
88 }
89 }
90
91 let overlapping: Vec<&WordSpan> = word_spans
93 .iter()
94 .filter(|span| span.start < match_end && span.end > match_start)
95 .collect();
96
97 if overlapping.is_empty() {
98 continue;
99 }
100
101 let bbox = overlapping
103 .iter()
104 .map(|span| &ocr_regions[span.region_idx])
105 .fold(None::<(u32, u32, u32, u32)>, |acc, r| {
106 let x2 = r.x + r.w;
107 let y2 = r.y + r.h;
108 Some(match acc {
109 None => (r.x, r.y, x2, y2),
110 Some((ax, ay, ax2, ay2)) => {
111 (ax.min(r.x), ay.min(r.y), ax2.max(x2), ay2.max(y2))
112 }
113 })
114 });
115
116 if let Some((x, y, x2, y2)) = bbox {
117 matches.push(PiiMatch {
118 x,
119 y,
120 w: x2 - x,
121 h: y2 - y,
122 pii_type: pii_type.to_string(),
123 text: matched_text,
124 });
125 }
126 }
127 }
128
129 Ok(matches)
130}
131
/// Checks a candidate payment-card number against the Luhn checksum.
///
/// `digits` must consist solely of ASCII digits and be 13 to 19 characters
/// long; anything else is rejected. Starting from the rightmost digit, every
/// second digit is doubled (subtracting 9 when the result exceeds 9) and the
/// total must be a multiple of 10.
fn luhn_valid(digits: &str) -> bool {
    if !(13..=19).contains(&digits.len()) {
        return false;
    }

    let mut sum: u32 = 0;
    for (i, byte) in digits.bytes().rev().enumerate() {
        // A non-digit must fail the check rather than silently count as 0,
        // otherwise a string of letters would sum to 0 and be reported valid.
        if !byte.is_ascii_digit() {
            return false;
        }
        let d = u32::from(byte - b'0');
        sum += if i % 2 == 1 {
            let doubled = d * 2;
            if doubled > 9 {
                doubled - 9
            } else {
                doubled
            }
        } else {
            d
        };
    }

    sum.is_multiple_of(10)
}
157
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ai::ocr::OcrRegion;

    /// Builds an OCR region with the given text and rectangle at full confidence.
    fn region(text: &str, x: u32, y: u32, w: u32, h: u32) -> OcrRegion {
        OcrRegion {
            text: text.into(),
            x,
            y,
            w,
            h,
            confidence: 1.0,
        }
    }

    #[test]
    fn empty_regions_returns_empty() {
        let result = detect_pii(&[]).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn detects_email() {
        let regions = [region("user@example.com", 0, 0, 100, 20)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "email"));
    }

    #[test]
    fn detects_ssn() {
        let regions = [region("123-45-6789", 10, 20, 80, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "ssn"));
    }

    #[test]
    fn detects_ipv4() {
        let regions = [region("192.168.1.1", 0, 0, 60, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "ip_v4"));
    }

    #[test]
    fn detects_url() {
        let regions = [region("https://example.com/path", 0, 0, 150, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "url"));
    }

    #[test]
    fn detects_api_key_sk() {
        let regions = [region("sk-abcdefghijklmnopqrstuvwxyz123456", 0, 0, 200, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(
            matches.iter().any(|m| m.pii_type == "api_key"),
            "{:?}",
            matches.iter().map(|m| &m.pii_type).collect::<Vec<_>>()
        );
    }

    #[test]
    fn detects_akia_key() {
        let regions = [region("AKIAIOSFODNN7EXAMPLE", 0, 0, 200, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "api_key"));
    }

    #[test]
    fn credit_card_luhn_valid_detected() {
        let regions = [region("4111111111111111", 0, 0, 120, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "credit_card"));
    }

    #[test]
    fn credit_card_luhn_invalid_skipped() {
        let regions = [region("1234567890123456", 0, 0, 120, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(!matches.iter().any(|m| m.pii_type == "credit_card"));
    }

    #[test]
    fn single_region_match_keeps_region_bbox() {
        let regions = [region("admin@corp.io", 5, 10, 90, 18)];
        let matches = detect_pii(&regions).unwrap();
        let m = matches.iter().find(|m| m.pii_type == "email").unwrap();
        assert_eq!(m.x, 5);
        assert_eq!(m.y, 10);
        assert_eq!(m.w, 90);
        assert_eq!(m.h, 18);
    }

    #[test]
    fn multi_word_match_unions_bboxes() {
        // A card number read as four separate words; the last word sits two
        // pixels lower so the union has to grow vertically as well.
        let regions = [
            region("4111", 0, 0, 40, 16),
            region("1111", 50, 0, 40, 16),
            region("1111", 100, 0, 40, 16),
            region("1111", 150, 2, 40, 16),
        ];
        let matches = detect_pii(&regions).unwrap();
        let m = matches
            .iter()
            .find(|m| m.pii_type == "credit_card")
            .unwrap();
        assert_eq!(m.text, "4111 1111 1111 1111");
        assert_eq!(m.x, 0);
        assert_eq!(m.y, 0);
        assert_eq!(m.w, 190);
        assert_eq!(m.h, 18);
    }

    #[test]
    fn luhn_valid_visa_test_card() {
        assert!(luhn_valid("4111111111111111"));
    }

    #[test]
    fn luhn_invalid_random_digits() {
        assert!(!luhn_valid("1234567890123456"));
    }

    #[test]
    fn luhn_rejects_wrong_length() {
        assert!(!luhn_valid("123"));
        assert!(!luhn_valid("12345678901234567890"));
    }
}