Skip to main content

fotos_lib/ai/
pii.rs

//! PII (Personally Identifiable Information) detection.
//!
//! Runs regex pattern matching on OCR-extracted text regions to identify
//! sensitive information and return a bounding box for each match.
5use anyhow::Result;
6use regex::Regex;
7
/// One piece of detected PII: what kind it is, the text that matched, and the
/// bounding box covering it.
///
/// The box is the union of the boxes of every OCR region whose text overlaps
/// the regex match, expressed in the same coordinate space as those regions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PiiMatch {
    /// Left edge of the box (smallest `x` among the overlapping regions).
    pub x: u32,
    /// Top edge of the box (smallest `y` among the overlapping regions).
    pub y: u32,
    /// Width of the box, from `x` to the largest `x + w` among the regions.
    pub w: u32,
    /// Height of the box, from `y` to the largest `y + h` among the regions.
    pub h: u32,
    /// Label of the pattern that matched, e.g. `"email"` or `"credit_card"`.
    pub pii_type: String,
    /// The matched text with surrounding whitespace trimmed.
    pub text: String,
}
16
17/// Detect PII in OCR-extracted word regions.
18///
19/// Concatenates all region text into one string (tracking each word's byte
20/// offsets), runs each regex against the full text, then unions the bounding
21/// boxes of every word that overlaps each match.
22pub fn detect_pii(ocr_regions: &[super::ocr::OcrRegion]) -> Result<Vec<PiiMatch>> {
23    if ocr_regions.is_empty() {
24        return Ok(vec![]);
25    }
26
27    // Build a single string from all word regions, tracking each word's
28    // byte offsets so we can map regex matches back to pixel coordinates.
29    struct WordSpan {
30        start: usize,
31        end: usize,
32        region_idx: usize,
33    }
34
35    let mut full_text = String::new();
36    let mut word_spans: Vec<WordSpan> = Vec::new();
37
38    for (i, region) in ocr_regions.iter().enumerate() {
39        let start = full_text.len();
40        full_text.push_str(&region.text);
41        let end = full_text.len();
42        word_spans.push(WordSpan {
43            start,
44            end,
45            region_idx: i,
46        });
47        full_text.push(' ');
48    }
49
50    // (pii_type, regex_pattern)
51    let patterns: &[(&str, &str)] = &[
52        ("email", r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}"),
53        (
54            "phone",
55            r"(?:\+1[\s\-]?)?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{4}",
56        ),
57        ("ssn", r"\b\d{3}-\d{2}-\d{4}\b"),
58        ("credit_card", r"\b(?:\d{4}[\s\-]?){3}\d{4}\b"),
59        (
60            "ip_v4",
61            r"\b(?:25[0-5]|2\d{2}|1\d{2}|[1-9]\d|\d)(?:\.(?:25[0-5]|2\d{2}|1\d{2}|[1-9]\d|\d)){3}\b",
62        ),
63        ("ip_v6", r"\b(?:[0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{0,4}\b"),
64        (
65            "api_key",
66            r"\b(?:sk|pk)[-_][a-zA-Z0-9]{16,}|ghp_[a-zA-Z0-9]{36}|AKIA[A-Z0-9]{16}\b",
67        ),
68        ("url", r"https?://[^\s]+"),
69    ];
70
71    let mut matches = Vec::new();
72
73    for &(pii_type, pattern) in patterns {
74        let re = Regex::new(pattern)?;
75        for mat in re.find_iter(&full_text) {
76            let match_start = mat.start();
77            let match_end = mat.end();
78            let matched_text = mat.as_str().trim().to_string();
79
80            // Credit cards: validate with Luhn algorithm to cut false positives.
81            if pii_type == "credit_card" {
82                let digits: String = matched_text
83                    .chars()
84                    .filter(|c| c.is_ascii_digit())
85                    .collect();
86                if !luhn_valid(&digits) {
87                    continue;
88                }
89            }
90
91            // Find every word span that overlaps this match.
92            let overlapping: Vec<&WordSpan> = word_spans
93                .iter()
94                .filter(|span| span.start < match_end && span.end > match_start)
95                .collect();
96
97            if overlapping.is_empty() {
98                continue;
99            }
100
101            // Union all overlapping word bounding boxes.
102            let bbox = overlapping
103                .iter()
104                .map(|span| &ocr_regions[span.region_idx])
105                .fold(None::<(u32, u32, u32, u32)>, |acc, r| {
106                    let x2 = r.x + r.w;
107                    let y2 = r.y + r.h;
108                    Some(match acc {
109                        None => (r.x, r.y, x2, y2),
110                        Some((ax, ay, ax2, ay2)) => {
111                            (ax.min(r.x), ay.min(r.y), ax2.max(x2), ay2.max(y2))
112                        }
113                    })
114                });
115
116            if let Some((x, y, x2, y2)) = bbox {
117                matches.push(PiiMatch {
118                    x,
119                    y,
120                    w: x2 - x,
121                    h: y2 - y,
122                    pii_type: pii_type.to_string(),
123                    text: matched_text,
124                });
125            }
126        }
127    }
128
129    Ok(matches)
130}
131
/// Luhn algorithm check for credit card numbers.
///
/// `digits` must consist solely of 13 to 19 ASCII digits; any other length or
/// any non-digit character makes the check fail. Returns `true` when the Luhn
/// checksum of the digits is a multiple of 10.
fn luhn_valid(digits: &str) -> bool {
    if !(13..=19).contains(&digits.len()) {
        return false;
    }
    // Reject stray characters outright: treating them as zero would let an
    // all-letter string pass with a checksum of 0.
    if !digits.bytes().all(|b| b.is_ascii_digit()) {
        return false;
    }

    let sum: u32 = digits
        .bytes()
        .rev()
        .enumerate()
        .map(|(i, b)| {
            let d = u32::from(b - b'0');
            // Counting from the right, every second digit is doubled; a
            // two-digit product is reduced by summing its digits (== minus 9).
            if i % 2 == 1 {
                let doubled = d * 2;
                if doubled > 9 {
                    doubled - 9
                } else {
                    doubled
                }
            } else {
                d
            }
        })
        .sum();
    sum.is_multiple_of(10)
}
157
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ai::ocr::OcrRegion;

    /// Builds an OCR region with the given text and box and full confidence.
    fn region(text: &str, x: u32, y: u32, w: u32, h: u32) -> OcrRegion {
        OcrRegion {
            text: text.into(),
            x,
            y,
            w,
            h,
            confidence: 1.0,
        }
    }

    #[test]
    fn empty_regions_returns_empty() {
        let result = detect_pii(&[]).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn detects_email() {
        let regions = [region("user@example.com", 0, 0, 100, 20)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "email"));
    }

    #[test]
    fn detects_ssn() {
        let regions = [region("123-45-6789", 10, 20, 80, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "ssn"));
    }

    #[test]
    fn detects_ipv4() {
        let regions = [region("192.168.1.1", 0, 0, 60, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "ip_v4"));
    }

    #[test]
    fn detects_url() {
        let regions = [region("https://example.com/path", 0, 0, 150, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "url"));
    }

    #[test]
    fn detects_api_key_sk() {
        let regions = [region("sk-abcdefghijklmnopqrstuvwxyz123456", 0, 0, 200, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(
            matches.iter().any(|m| m.pii_type == "api_key"),
            "{:?}",
            matches.iter().map(|m| &m.pii_type).collect::<Vec<_>>()
        );
    }

    #[test]
    fn detects_akia_key() {
        let regions = [region("AKIAIOSFODNN7EXAMPLE", 0, 0, 200, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "api_key"));
    }

    #[test]
    fn credit_card_luhn_valid_detected() {
        // Visa test number: 4111111111111111 (Luhn valid)
        let regions = [region("4111111111111111", 0, 0, 120, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(matches.iter().any(|m| m.pii_type == "credit_card"));
    }

    #[test]
    fn credit_card_luhn_invalid_skipped() {
        // Invalid card number
        let regions = [region("1234567890123456", 0, 0, 120, 16)];
        let matches = detect_pii(&regions).unwrap();
        assert!(!matches.iter().any(|m| m.pii_type == "credit_card"));
    }

    #[test]
    fn single_word_match_keeps_region_bbox() {
        // A match contained in one region reports exactly that region's box.
        let regions = [region("admin@corp.io", 5, 10, 90, 18)];
        let matches = detect_pii(&regions).unwrap();
        let m = matches.iter().find(|m| m.pii_type == "email").unwrap();
        assert_eq!(m.x, 5);
        assert_eq!(m.y, 10);
        assert_eq!(m.w, 90);
        assert_eq!(m.h, 18);
    }

    #[test]
    fn multi_word_match_unions_bboxes() {
        // Card number split into four words by OCR tokenization; the regions
        // are joined with spaces, so the pattern matches across all of them.
        let regions = [
            region("4111", 0, 0, 40, 16),
            region("1111", 50, 2, 40, 16),
            region("1111", 100, 0, 40, 18),
            region("1111", 150, 1, 40, 16),
        ];
        let matches = detect_pii(&regions).unwrap();
        let m = matches
            .iter()
            .find(|m| m.pii_type == "credit_card")
            .unwrap();
        assert_eq!(m.text, "4111 1111 1111 1111");
        // Left/top are the minima; right/bottom are the maxima of x+w / y+h.
        assert_eq!(m.x, 0);
        assert_eq!(m.y, 0);
        assert_eq!(m.w, 190);
        assert_eq!(m.h, 18);
    }

    #[test]
    fn luhn_valid_visa_test_card() {
        assert!(luhn_valid("4111111111111111"));
    }

    #[test]
    fn luhn_invalid_random_digits() {
        assert!(!luhn_valid("1234567890123456"));
    }

    #[test]
    fn luhn_rejects_wrong_length() {
        assert!(!luhn_valid("123"));
        assert!(!luhn_valid("12345678901234567890"));
    }
}