11 Commits

Author SHA1 Message Date
  Ron Shakutai 8c453d4948 remove benchmark script. 9 hours ago
  Ron Shakutai c46929fd05
Merge branch 'main' into ronshakutai/gpu-optimizations 9 hours ago
  dependabot[bot] e0109d405c
Bump actions/cache from 4 to 5 (#1817) 9 hours ago
  Ron Shakutai 8d7804dca4 Add comprehensive error path mocking for device_detector and spacy GPU config tests 9 hours ago
  Ron Shakutai 754349be5e test: enhance GPU detection tests for DeviceDetector and SpacyNlpEngine 10 hours ago
  Ron Shakutai 0509b6a824 docs added around gpu 10 hours ago
  Ron Shakutai 03742ea2fd fix: reorder device_detector import for consistency 10 hours ago
  Ron Shakutai 62a1185844 Merge branch 'main' of https://github.com/microsoft/presidio into ronshakutai/gpu-optimizations 10 hours ago
  Ron Shakutai ffe66e0ded refactor: remove unused GPU result files and update device detector usage 10 hours ago
  Hoa Ngo 586eaa8083
fix(analyzer): Pass map_location to GLiNER.from_pretrained for GPU support (#1813) 1 day ago
  Ron Shakutai 2ac7a320ce
Change parameters in extraction in langextract (#1811) 4 days ago
27 changed files with 1147 additions and 892 deletions
Split View
  1. +2
    -2
      .github/workflows/ci.yml
  2. +0
    -606
      benchmark_presidio.py
  3. +19
    -0
      docs/getting_started/getting_started_text.md
  4. +16
    -0
      docs/installation.md
  5. +16
    -0
      presidio-analyzer/README.md
  6. +0
    -24
      presidio-analyzer/gpu_gliner_results.json
  7. +0
    -24
      presidio-analyzer/gpu_spacy_results.json
  8. +0
    -24
      presidio-analyzer/gpu_stanza_results.json
  9. +0
    -24
      presidio-analyzer/gpu_trans_results.json
  10. +7
    -15
      presidio-analyzer/presidio_analyzer/conf/langextract_config_azureopenai.yaml
  11. +11
    -1
      presidio-analyzer/presidio_analyzer/conf/langextract_config_ollama.yaml
  12. +2
    -2
      presidio-analyzer/presidio_analyzer/nlp_engine/__init__.py
  13. +35
    -60
      presidio-analyzer/presidio_analyzer/nlp_engine/device_detector.py
  14. +2
    -3
      presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
  15. +94
    -12
      presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py
  16. +0
    -10
      presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py
  17. +5
    -3
      presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py
  18. +21
    -39
      presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/azure_openai_langextract_recognizer.py
  19. +54
    -4
      presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/langextract_recognizer.py
  20. +22
    -36
      presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/ollama_langextract_recognizer.py
  21. +3
    -1
      presidio-analyzer/pyproject.toml
  22. +130
    -0
      presidio-analyzer/tests/test_azure_openai_langextract_recognizer.py
  23. +187
    -0
      presidio-analyzer/tests/test_device_detector.py
  24. +0
    -1
      presidio-analyzer/tests/test_gliner_recognizer.py
  25. +133
    -0
      presidio-analyzer/tests/test_ollama_recognizer.py
  26. +49
    -1
      presidio-analyzer/tests/test_spacy_nlp_engine.py
  27. +339
    -0
      presidio-analyzer/tests/test_stanza_batch_processing.py

+ 2
- 2
.github/workflows/ci.yml View File

@@ -327,7 +327,7 @@ jobs:
python-version: '3.10'

- name: Cache E2E dependencies
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
~/.cache/pip
@@ -415,7 +415,7 @@ jobs:
python-version: '3.10'

- name: Cache E2E dependencies
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
~/.cache/pip


+ 0
- 606
benchmark_presidio.py View File

@@ -1,606 +0,0 @@
#!/usr/bin/env python3
"""Comprehensive benchmark script for Presidio Analyzer performance testing.

Tests different dataset sizes and NLP engines (spaCy, Transformers, GLiNER).
Generates a markdown report.
"""

import argparse
import json
import logging
import sys
import time
import warnings

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine

# Configure logging - suppress presidio-analyzer INFO logs
logging.basicConfig(
    level=logging.ERROR,  # only surface errors; benchmark output stays readable
    format='%(levelname)s - %(name)s - %(message)s',
    stream=sys.stderr  # keep log noise off stdout, where results are printed
)

# Suppress warnings from spacy_huggingface_pipelines and other libraries
warnings.filterwarnings('ignore')

# Optional imports for different NLP engines
try:
from presidio_analyzer.nlp_engine import NlpEngineProvider, TransformersNlpEngine
from presidio_analyzer.nlp_engine.ner_model_configuration import (
NerModelConfiguration,
)
HAS_TRANSFORMERS = True
except ImportError:
HAS_TRANSFORMERS = False

try:
from presidio_analyzer.predefined_recognizers import GLiNERRecognizer
HAS_GLINER = True
except ImportError:
HAS_GLINER = False

try:
from presidio_analyzer.nlp_engine import StanzaNlpEngine
HAS_STANZA = True
except ImportError:
HAS_STANZA = False

# Sample texts for testing - large dataset
# Templates with {placeholder} fields; generate_test_texts() fills them by
# cycling through the value lists below, so output is deterministic.
TEST_TEXT_TEMPLATES = [
    (
        "My name is {name} and my email is {email}. "
        "I work at {company} as a software engineer."
    ),
    "Patient information: Name: {name}, SSN: {ssn}, Phone: {phone}, Address: {address}",
    (
        "Dear {name}, your account {email} has been verified. "
        "Contact us at {phone} for support."
    ),
    "Employee ID: {id}, Name: {name}, Credit Card: {cc}, Expires: {exp_date}",
    "Contact {name} at {phone} or email {email}. Office located at {address}.",
    (
        "Medical record for {name}, born {dob}. "
        "Insurance details: Policy #{id}, contact {phone}."
    ),
    "Transaction approved for {name}. Card ending {cc_last4}. Receipt sent to {email}.",
    (
        "Hello {name}, your appointment at {address} is confirmed "
        "for {date} at {time}. Call {phone} if needed."
    ),
    "User profile: {name}, Username: {email}, Phone: {phone}, Registered: {date}",
    (
        "Billing statement for {name} at {address}. Amount due: $2,500. "
        "Questions? Email {email} or call {phone}."
    ),
    (
        "Dear Dr. {name}, patient consultation scheduled {date}. "
        "Patient contact: {phone}, Address: {address}"
    ),
    (
        "Account #{id} for {name} ({email}) shows activity on {date}. "
        "Security code sent to {phone}."
    ),
    (
        "Prescription refill for {name}, DOB: {dob}. Pharmacy: {address}. "
        "Insurance verification needed, call {phone}."
    ),
    (
        "Welcome {name}! Your credit card {cc} has been added. "
        "Billing address: {address}. Contact: {email}"
    ),
    (
        "Invoice #{id} - {name}, {company}. Payment to {address}. "
        "Due {date}. Support: {email}/{phone}"
    ),
]

# Synthetic person names for the {name} placeholder.
NAMES = [
    "John Smith",
    "Sarah Johnson",
    "Michael Brown",
    "Emily Davis",
    "James Wilson",
    "Jessica Martinez",
    "David Anderson",
    "Jennifer Taylor",
    "Robert Thomas",
    "Mary Garcia",
    "Christopher Lee",
    "Patricia Rodriguez",
    "Daniel White",
    "Linda Harris",
    "Matthew Clark",
    "Barbara Lewis",
    "Joseph Walker",
    "Susan Hall",
    "Charles Allen",
    "Karen Young",
]

# Synthetic email addresses for the {email} placeholder.
EMAILS = [
    "john.smith@example.com",
    "sarah.j@company.org",
    "mbrown@corp.net",
    "emily.davis@mail.com",
    "jwilson@business.io",
    "jmartinez@enterprise.com",
    "david.a@startup.tech",
    "jtaylor@firm.law",
    "rthomas@clinic.med",
    "mgarcia@university.edu",
    "clee@consulting.biz",
    "prodriguez@agency.gov",
    "dwhite@financial.com",
    "lharris@retail.store",
    "mclark@manufacturing.ind",
    "blewis@services.pro",
    "jwalker@healthcare.org",
    "shall@education.edu",
    "callen@technology.io",
    "kyoung@pharma.com",
]

# Synthetic phone numbers (mix of local and +1-prefixed formats).
PHONES = [
    "555-123-4567",
    "555-234-5678",
    "555-345-6789",
    "555-456-7890",
    "555-567-8901",
    "+1-555-678-9012",
    "+1-202-555-0173",
    "555-789-0123",
    "555-890-1234",
    "555-901-2345",
    "+1-415-555-0198",
    "+1-310-555-0142",
    "555-111-2222",
    "555-222-3333",
    "555-333-4444",
    "+1-713-555-0156",
    "+1-617-555-0187",
    "555-444-5555",
    "555-555-6666",
    "555-666-7777",
]

# Synthetic US Social Security numbers for the {ssn} placeholder.
SSNS = [
    "123-45-6789", "234-56-7890", "345-67-8901", "456-78-9012", "567-89-0123",
    "678-90-1234", "789-01-2345", "890-12-3456", "901-23-4567", "012-34-5678",
    "111-22-3333", "222-33-4444", "333-44-5555", "444-55-6666", "555-66-7777",
    "666-77-8888", "777-88-9999", "888-99-0000", "999-00-1111", "000-11-2222"
]

# Synthetic US street addresses for the {address} placeholder.
ADDRESSES = [
    "123 Main St, New York, NY 10001", "456 Oak Ave, Los Angeles, CA 90012",
    "789 Pine Rd, Chicago, IL 60601", "321 Elm St, Houston, TX 77001",
    "654 Maple Dr, Phoenix, AZ 85001", "987 Cedar Ln, Philadelphia, PA 19101",
    "147 Birch Way, San Antonio, TX 78201", "258 Spruce Ct, San Diego, CA 92101",
    "369 Willow Pl, Dallas, TX 75201", "741 Ash Blvd, San Jose, CA 95101",
    "852 Hickory St, Austin, TX 78701", "963 Walnut Ave, Jacksonville, FL 32099",
    "159 Chestnut Rd, Fort Worth, TX 76101", "357 Magnolia Dr, Columbus, OH 43004",
    "486 Sycamore Ln, Charlotte, NC 28201"
]

# Synthetic credit card numbers (varied network formats) for {cc}.
CREDIT_CARDS = [
    "4532-1234-5678-9010", "5425-2345-6789-0123", "3782-345678-90123",
    "6011-4567-8901-2345", "3056-567890-1234", "4916-6789-0123-4567",
    "5412-7890-1234-5678", "3714-890123-45678", "6011-9012-3456-7890"
]

# MM/DD/YYYY dates for the {date} and {exp_date} placeholders.
DATES = [
    "01/15/2024", "02/20/2024", "03/25/2024", "04/10/2024", "05/18/2024",
    "06/22/2024", "07/30/2024", "08/14/2024", "09/05/2024", "10/12/2024",
    "11/28/2024", "12/31/2024"
]

# Clock times for the {time} placeholder.
TIMES = [
    "10:30 AM",
    "2:15 PM",
    "9:00 AM",
    "4:45 PM",
    "11:20 AM",
    "3:30 PM",
    "8:15 AM",
]
# Dates of birth for the {dob} placeholder.
DOBS = [
    "05/15/1985",
    "08/22/1990",
    "03/10/1978",
    "11/05/1982",
    "07/30/1995",
    "12/18/1988",
]


def generate_test_texts(count):
    """Generate ``count`` synthetic texts containing PII placeholders.

    Templates and value lists are cycled by index, so the output is
    deterministic for a given ``count``.

    :param count: Number of texts to generate.
    :return: List of ``count`` formatted strings.
    """
    def _cycle(values, idx):
        # Wrap around each value list so any count is supported.
        return values[idx % len(values)]

    generated = []
    for idx in range(count):
        template = _cycle(TEST_TEXT_TEMPLATES, idx)
        generated.append(
            template.format(
                name=_cycle(NAMES, idx),
                email=_cycle(EMAILS, idx),
                phone=_cycle(PHONES, idx),
                ssn=_cycle(SSNS, idx),
                address=_cycle(ADDRESSES, idx),
                company=f"Company{idx % 50}",
                id=f"EMP{10000 + idx}",
                cc=_cycle(CREDIT_CARDS, idx),
                cc_last4=str(1000 + idx % 9000),
                exp_date=_cycle(DATES, idx),
                date=_cycle(DATES, idx),
                time=_cycle(TIMES, idx),
                dob=_cycle(DOBS, idx),
            )
        )
    return generated


def create_transformers_analyzer():
    """Build an ``AnalyzerEngine`` backed by the Transformers NLP engine.

    :raises ImportError: If the transformers extra is not installed.
    :return: AnalyzerEngine configured for English.
    """
    if not HAS_TRANSFORMERS:
        raise ImportError(
            "Transformers support not available. "
            "Install with: pip install 'presidio-analyzer[transformers]'"
        )

    # Simple inline configuration (same as previous working version);
    # performs better than loading from a config file.
    model_config = [
        {
            "lang_code": "en",
            "model_name": {
                "spacy": "en_core_web_sm",
                "transformers": "StanfordAIMI/stanford-deidentifier-base",
            },
        }
    ]

    # Entity mapping taken from the official transformers.yaml config.
    entity_translation = dict(
        PER="PERSON",
        PERSON="PERSON",
        LOC="LOCATION",
        LOCATION="LOCATION",
        GPE="LOCATION",
        ORG="ORGANIZATION",
        ORGANIZATION="ORGANIZATION",
        NORP="NRP",
        AGE="AGE",
        ID="ID",
        EMAIL="EMAIL",
        PATIENT="PERSON",
        STAFF="PERSON",
        HOSP="ORGANIZATION",
        PATORG="ORGANIZATION",
        DATE="DATE_TIME",
        TIME="DATE_TIME",
        PHONE="PHONE_NUMBER",
        HCW="PERSON",
        HOSPITAL="LOCATION",
        FACILITY="LOCATION",
        VENDOR="ORGANIZATION",
    )

    ner_config = NerModelConfiguration(
        model_to_presidio_entity_mapping=entity_translation,
        alignment_mode="strict",  # faster than "expand"
        aggregation_strategy="simple",  # faster than "max"
        labels_to_ignore=["O"],
    )

    engine = TransformersNlpEngine(
        models=model_config,
        ner_model_configuration=ner_config,
    )
    return AnalyzerEngine(nlp_engine=engine, supported_languages=["en"])


def create_gliner_analyzer():
    """Build an ``AnalyzerEngine`` that delegates NER to a GLiNER recognizer.

    :raises ImportError: If the gliner extra is not installed.
    :return: AnalyzerEngine with GLiNER added and the spaCy NER removed.
    """
    if not HAS_GLINER:
        raise ImportError(
            "GLiNER support not available. "
            "Install with: pip install 'presidio-analyzer[gliner]'"
        )

    # A small spaCy model suffices; its NER recognizer is replaced below.
    spacy_config = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }
    engine = NlpEngineProvider(nlp_configuration=spacy_config).create_engine()
    analyzer = AnalyzerEngine(nlp_engine=engine, supported_languages=["en"])

    # GLiNER label -> Presidio entity type.
    gliner_to_presidio = {
        "person": "PERSON",
        "name": "PERSON",
        "organization": "ORGANIZATION",
        "location": "LOCATION",
        "phone number": "PHONE_NUMBER",
        "email": "EMAIL_ADDRESS",
        "email address": "EMAIL_ADDRESS",
        "credit card number": "CREDIT_CARD",
        "social security number": "US_SSN",
        "date of birth": "DATE_TIME",
        "address": "LOCATION",
    }

    # The recognizer auto-detects GPU via DeviceDetector.
    recognizer = GLiNERRecognizer(
        model_name="urchade/gliner_multi_pii-v1",
        entity_mapping=gliner_to_presidio,
        flat_ner=False,
        multi_label=True,
    )

    # Swap spaCy's NER for GLiNER in the recognizer registry.
    analyzer.registry.add_recognizer(recognizer)
    analyzer.registry.remove_recognizer("SpacyRecognizer")
    return analyzer


def create_stanza_analyzer():
    """Build an ``AnalyzerEngine`` backed by the Stanza NLP engine.

    :raises ImportError: If the stanza extra is not installed.
    :return: AnalyzerEngine configured for English.
    """
    if not HAS_STANZA:
        raise ImportError(
            "Stanza support not available. "
            "Install with: pip install 'presidio-analyzer[stanza]'"
        )

    # Entity mapping taken from the stanza.yaml config.
    stanza_to_presidio = dict(
        PER="PERSON",
        PERSON="PERSON",
        NORP="NRP",
        FAC="LOCATION",
        LOC="LOCATION",
        LOCATION="LOCATION",
        GPE="LOCATION",
        ORG="ORGANIZATION",
        ORGANIZATION="ORGANIZATION",
        DATE="DATE_TIME",
        TIME="DATE_TIME",
    )

    ner_config = NerModelConfiguration(
        model_to_presidio_entity_mapping=stanza_to_presidio,
        labels_to_ignore=["O"],
    )

    # Stanza NLP engine; GPU usage is handled inside StanzaNlpEngine.
    engine = StanzaNlpEngine(
        models=[{"lang_code": "en", "model_name": "en"}],
        ner_model_configuration=ner_config,
    )
    return AnalyzerEngine(nlp_engine=engine, supported_languages=["en"])


def run_benchmark(num_texts, batch_size, engine_type="spacy"):
    """Benchmark one engine on a generated dataset.

    :param num_texts: Number of texts to process.
    :param batch_size: Batch size passed to the batch analyzer.
    :param engine_type: One of "spacy", "transformers", "gliner", "stanza".
    :return: Dict of timing, throughput and entity-count metrics.
    """
    separator = '=' * 80
    print(f"\n{separator}")
    print(
        f"Running benchmark: {num_texts} texts, "
        f"batch_size={batch_size}, engine={engine_type}"
    )
    print(separator)

    # Build the synthetic dataset.
    print(f"Generating {num_texts} test texts...")
    texts = generate_test_texts(num_texts)

    # Instantiate the requested analyzer and time its start-up cost.
    print(f"Initializing AnalyzerEngine ({engine_type})...")
    init_start = time.time()
    factories = {
        "transformers": create_transformers_analyzer,
        "gliner": create_gliner_analyzer,
        "stanza": create_stanza_analyzer,
    }
    analyzer = factories.get(engine_type, AnalyzerEngine)()  # default: spacy
    batch_analyzer = BatchAnalyzerEngine(analyzer)
    init_time = time.time() - init_start
    print(f" Initialization: {init_time:.2f}s")

    # Warm-up pass so one-time model loading costs don't skew the main run.
    print("Warm-up run...")
    warmup_start = time.time()
    _ = batch_analyzer.analyze_iterator(
        texts=texts[:min(10, num_texts)],
        language="en",
        batch_size=batch_size,
    )
    warmup_time = time.time() - warmup_start
    print(f" Warm-up: {warmup_time:.2f}s")

    # Timed run over the full dataset.
    # NOTE(review): assumes analyze_iterator returns a materialized sequence;
    # if it is lazy, the timing below would exclude the actual analysis work —
    # verify against BatchAnalyzerEngine.
    print(f"Processing {num_texts} texts...")
    analysis_start = time.time()
    results = batch_analyzer.analyze_iterator(
        texts=texts,
        language="en",
        batch_size=batch_size,
    )
    total_analysis_time = time.time() - analysis_start

    total_entities = sum(len(result) for result in results)
    avg_time = total_analysis_time / num_texts
    throughput = num_texts / total_analysis_time

    print(f" Complete: {total_analysis_time:.2f}s")
    print(f" Throughput: {throughput:.2f} texts/second")
    print(f" Entities found: {total_entities}")

    return {
        "num_texts": num_texts,
        "batch_size": batch_size,
        "engine_type": engine_type,
        "init_time": init_time,
        "warmup_time": warmup_time,
        "total_time": total_analysis_time,
        "avg_time_ms": avg_time * 1000,
        "throughput": throughput,
        "total_entities": total_entities,
    }


def main():
    """Run comprehensive benchmarks on Presidio Analyzer engines."""
    parser = argparse.ArgumentParser(
        description="Comprehensive Presidio Analyzer performance benchmark"
    )
    parser.add_argument(
        "--json",
        type=str,
        default="benchmark_results.json",
        help="Save results as JSON to this file (default: benchmark_results.json)",
    )
    parser.add_argument(
        "--engines",
        type=str,
        default="spacy",
        help=(
            "Comma-separated list of engines to test: "
            "spacy,transformers,gliner,stanza (default: spacy)"
        ),
    )
    parser.add_argument(
        "--sizes",
        type=str,
        default="50,500,5000",
        help="Comma-separated list of dataset sizes to test (default: 50,500,5000)",
    )
    args = parser.parse_args()

    # Availability table for optional engines:
    # name -> (available flag, display name, pip extra).
    optional_engines = {
        "transformers": (HAS_TRANSFORMERS, "Transformers", "transformers"),
        "gliner": (HAS_GLINER, "GLiNER", "gliner"),
        "stanza": (HAS_STANZA, "Stanza", "stanza"),
    }

    # Keep only engines that were requested AND are importable.
    available_engines = []
    for engine in (e.strip() for e in args.engines.split(',')):
        if engine == "spacy":
            # spaCy is a core dependency and is always available.
            available_engines.append("spacy")
        elif engine in optional_engines:
            is_available, display, extra = optional_engines[engine]
            if is_available:
                available_engines.append(engine)
            else:
                print(
                    f"⚠️ {display} engine requested but not available. "
                    f"Install with: pip install 'presidio-analyzer[{extra}]'"
                )
        else:
            print(f"⚠️ Unknown engine: {engine}. Skipping.")

    if not available_engines:
        print("❌ No valid engines available. Exiting.")
        sys.exit(1)

    # Parse dataset sizes.
    try:
        dataset_sizes = [int(s.strip()) for s in args.sizes.split(',')]
    except ValueError:
        print(
            "❌ Invalid dataset sizes format. "
            "Use comma-separated integers (e.g., 50,500,5000)"
        )
        sys.exit(1)

    # Batch size is currently fixed regardless of dataset size.
    def get_batch_size(num_texts):
        return 16

    # One (size, batch_size, engine) tuple per benchmark run.
    test_configs = [
        (size, get_batch_size(size), engine)
        for engine in available_engines
        for size in dataset_sizes
    ]

    banner = "=" * 80
    print(banner)
    print("PRESIDIO ANALYZER COMPREHENSIVE BENCHMARK")
    print(banner)
    print(f"\nEngines to test: {', '.join(available_engines)}")
    print(f"Dataset sizes: {', '.join(str(s) for s in dataset_sizes)}")
    print(f"Total tests: {len(test_configs)}")
    print("This may take several minutes...\n")

    all_results = []
    for num_texts, batch_size, engine in test_configs:
        try:
            all_results.append(run_benchmark(num_texts, batch_size, engine))
        except KeyboardInterrupt:
            print("\n\n⚠️ Benchmark interrupted by user")
            if all_results:
                print("Generating partial results...")
            else:
                print("No results to save.")
                sys.exit(1)
            break
        except Exception as e:
            # A single failing configuration shouldn't abort the whole sweep.
            print(
                f"\n❌ Error running benchmark for {num_texts} texts "
                f"with {engine} engine: {e}"
            )
            import traceback
            traceback.print_exc()
            continue

    if all_results:
        # Persist whatever completed (possibly partial after an interrupt).
        with open(args.json, 'w') as f:
            json.dump(all_results, f, indent=2)
        print(f"✅ JSON results saved to: {args.json}")

        print("\n" + banner)
        print("BENCHMARK COMPLETE")
        print(banner)
    else:
        print("\n❌ No results collected")
        sys.exit(1)


if __name__ == "__main__":
    # Top-level guard: translate Ctrl-C and unexpected failures into a
    # non-zero exit code without a raw traceback on interrupt.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n⚠️ Benchmark interrupted by user")
        sys.exit(1)
    except Exception as exc:
        print(f"\n❌ Fatal error: {exc}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)

+ 19
- 0
docs/getting_started/getting_started_text.md View File

@@ -93,6 +93,25 @@ Using Presidio's modules as Python packages to get started:
!!! tip "Tip: Downloading models"
If not available, the transformers model and the spacy model would be downloaded on the first call to the `AnalyzerEngine`. To pre-download, see [this doc](../analyzer/nlp_engines/transformers.md#downloading-a-pre-trained-model).

=== "GPU Acceleration (Optional)"

GPU support is available for spaCy, Stanza, Transformers, and GLiNER.

- Install the `gpu` extra (includes `cupy-cuda12x`):
```sh
pip install "presidio-analyzer[gpu]"
pip install presidio-anonymizer
```

- Combine with other extras:
```sh
pip install "presidio-analyzer[transformers,gpu]"
pip install presidio-anonymizer
python -m spacy download en_core_web_sm
```

- Or install your own CUDA dependencies

## Simple flow - Docker container

Presidio provides Docker containers that you can use to de-identify text data. Each module, analyzer, and anonymizer, has its own Docker container. The containers are available on Docker Hub.


+ 16
- 0
docs/installation.md View File

@@ -61,6 +61,22 @@ with at least one NLP engine (`spaCy`, `transformers` or `stanza`):
Stanza models would be loaded lazily. To pre-load them, see: [Downloading a pre-trained model](./analyzer/nlp_engines/spacy_stanza.md#download-the-pre-trained-model).
### GPU acceleration (optional)
For GPU acceleration with spaCy, Stanza, Transformers, or GLiNER:
- Install the `gpu` extra (includes `cupy-cuda12x`):
```sh
pip install "presidio-analyzer[gpu]"
```
- Combine with other extras:
```sh
pip install "presidio-analyzer[transformers,gpu]"
```
- Or install your own CUDA dependencies
### PII redaction in images
For PII redaction in images


+ 16
- 0
presidio-analyzer/README.md View File

@@ -84,6 +84,22 @@ print(results)

```

## GPU Acceleration

GPU support is available for spaCy, Stanza, Transformers, and GLiNER.

- Install the `gpu` extra (includes `cupy-cuda12x`):
```bash
pip install "presidio-analyzer[gpu]"
```

- Combine with other extras:
```bash
pip install "presidio-analyzer[transformers,gpu]"
```

- Or install your own CUDA dependencies

## Documentation

Additional documentation on installation, usage and extending the Analyzer can be found under the [Analyzer](https://microsoft.github.io/presidio/analyzer/) section of [Presidio Documentation](https://microsoft.github.io/presidio)

+ 0
- 24
presidio-analyzer/gpu_gliner_results.json View File

@@ -1,24 +0,0 @@
[
{
"num_texts": 50,
"batch_size": 16,
"engine_type": "gliner",
"init_time": 9.893409252166748,
"warmup_time": 2.4725661277770996,
"total_time": 2.848795175552368,
"avg_time_ms": 56.97590351104736,
"throughput": 17.551279372096392,
"total_entities": 242
},
{
"num_texts": 500,
"batch_size": 16,
"engine_type": "gliner",
"init_time": 9.43524694442749,
"warmup_time": 0.5841579437255859,
"total_time": 45.80630612373352,
"avg_time_ms": 91.61261224746704,
"throughput": 10.915527627339854,
"total_entities": 2416
}
]

+ 0
- 24
presidio-analyzer/gpu_spacy_results.json View File

@@ -1,24 +0,0 @@
[
{
"num_texts": 50,
"batch_size": 16,
"engine_type": "spacy",
"init_time": 3.232339859008789,
"warmup_time": 1.648664951324463,
"total_time": 0.5361835956573486,
"avg_time_ms": 10.723671913146973,
"throughput": 93.25164067860219,
"total_entities": 235
},
{
"num_texts": 500,
"batch_size": 16,
"engine_type": "spacy",
"init_time": 1.908512830734253,
"warmup_time": 0.2146310806274414,
"total_time": 4.622352361679077,
"avg_time_ms": 9.244704723358154,
"throughput": 108.17003137735138,
"total_entities": 2377
}
]

+ 0
- 24
presidio-analyzer/gpu_stanza_results.json View File

@@ -1,24 +0,0 @@
[
{
"num_texts": 50,
"batch_size": 16,
"engine_type": "stanza",
"init_time": 5.2999162673950195,
"warmup_time": 2.0750410556793213,
"total_time": 7.569249153137207,
"avg_time_ms": 151.38498306274414,
"throughput": 6.60567501325764,
"total_entities": 253
},
{
"num_texts": 500,
"batch_size": 16,
"engine_type": "stanza",
"init_time": 6.010739803314209,
"warmup_time": 2.8395984172821045,
"total_time": 160.41057419776917,
"avg_time_ms": 320.82114839553833,
"throughput": 3.1170014975668203,
"total_entities": 2510
}
]

+ 0
- 24
presidio-analyzer/gpu_trans_results.json View File

@@ -1,24 +0,0 @@
[
{
"num_texts": 50,
"batch_size": 16,
"engine_type": "transformers",
"init_time": 1.8468782901763916,
"warmup_time": 1.5114922523498535,
"total_time": 0.7709858417510986,
"avg_time_ms": 15.419716835021973,
"throughput": 64.85203397047823,
"total_entities": 273
},
{
"num_texts": 500,
"batch_size": 16,
"engine_type": "transformers",
"init_time": 1.2162683010101318,
"warmup_time": 0.1569383144378662,
"total_time": 7.970991134643555,
"avg_time_ms": 15.941982269287111,
"throughput": 62.72745654262466,
"total_entities": 2746
}
]

+ 7
- 15
presidio-analyzer/presidio_analyzer/conf/langextract_config_azureopenai.yaml View File

@@ -1,17 +1,8 @@
# Azure OpenAI Configuration for LangExtract
#
# This config file is OPTIONAL for basic usage. You can pass model_id and credentials
# as parameters instead of using this file.
#
# Use this file when you need to customize:
# - Supported entities
# - Entity mappings
# - Prompts and examples
# - Detection parameters
#
# IMPORTANT: The model_id below is a placeholder. You can:
# 1. Pass model_id as a parameter: AzureOpenAILangExtractRecognizer(model_id="your-deployment-name")
# 2. OR update model_id below to match your Azure OpenAI deployment name
# Required parameters: model_id (deployment name)
# Auth parameters: azure_endpoint, api_key (via constructor or environment variables)
# Optional parameters use defaults if not specified

lm_recognizer:
supported_entities:
@@ -41,11 +32,12 @@ langextract:
examples_file: "presidio-analyzer/presidio_analyzer/conf/langextract_prompts/default_pii_phi_examples.yaml"
model:
# Azure OpenAI deployment name (e.g., "gpt-4", "gpt-4o", "my-gpt-deployment")
# This is the deployment name from Azure Portal, NOT the model name
# You can override this by passing model_id parameter to the recognizer
model_id: "gpt-4o"
temperature: null
# Optional: Uncomment to override defaults
# fence_output: true
# use_schema_constraints: false
entity_mappings:
person: PERSON


+ 11
- 1
presidio-analyzer/presidio_analyzer/conf/langextract_config_ollama.yaml View File

@@ -1,5 +1,8 @@
# Ollama Configuration
# Ollama Configuration for LangExtract
# https://github.com/google/langextract#using-local-llms-with-ollama
#
# Required parameters: model_id, model_url
# Optional parameters use defaults if not specified

lm_recognizer:
supported_entities:
@@ -30,6 +33,13 @@ langextract:
model_id: "qwen2.5:1.5b"
model_url: "http://localhost:11434"
temperature: null
# Optional: Uncomment to override defaults
# max_char_buffer: 400
# use_schema_constraints: false
# fence_output: false
# timeout: 240
# num_ctx: 8192
entity_mappings:
person: PERSON


+ 2
- 2
presidio-analyzer/presidio_analyzer/nlp_engine/__init__.py View File

@@ -1,6 +1,6 @@
"""NLP engine package. Performs text pre-processing."""

from .device_detector import DeviceDetector
from .device_detector import device_detector
from .ner_model_configuration import NerModelConfiguration
from .nlp_artifacts import NlpArtifacts
from .nlp_engine import NlpEngine
@@ -11,7 +11,7 @@ from .transformers_nlp_engine import TransformersNlpEngine
from .nlp_engine_provider import NlpEngineProvider # isort:skip

__all__ = [
"DeviceDetector",
"device_detector",
"NerModelConfiguration",
"NlpArtifacts",
"NlpEngine",


+ 35
- 60
presidio-analyzer/presidio_analyzer/nlp_engine/device_detector.py View File

@@ -1,4 +1,11 @@
"""GPU/CPU device detection singleton for Presidio NLP engines."""
"""GPU/CPU device detection for Presidio NLP engines.

This module creates a single, process-wide DeviceDetector instance.
Consumers may import and use the shared instance directly.

The detector is initialized once at import time and is intended to be
read-only in practice.
"""

import logging
from typing import Optional
@@ -7,84 +14,52 @@ logger = logging.getLogger("presidio-analyzer")


class DeviceDetector:
"""Singleton for GPU/CPU detection. Lazy initialization on first use."""

_instance: Optional["DeviceDetector"] = None
_torch_initialized: bool = False
_has_torch_gpu: bool = False
_torch_device: str = "cpu"
_torch_device_name: Optional[str] = None
"""Detect and expose PyTorch GPU/CPU availability.

def __new__(cls) -> "DeviceDetector":
"""Return singleton instance and detect torch GPU on first creation."""
if cls._instance is None:
cls._instance = super(DeviceDetector, cls).__new__(cls)
cls._instance._detect_torch_gpu()
return cls._instance
This class performs a one-time detection of CUDA availability and
exposes the result for reuse across the process.
"""

def _detect_torch_gpu(self) -> None:
"""Detect PyTorch GPU/CUDA once."""
if DeviceDetector._torch_initialized:
return
def __init__(self) -> None:
self._device = "cpu"
self._device_name: Optional[str] = None
self._detect()

def _detect(self) -> None:
"""Detect PyTorch CUDA support once."""
try:
import torch

if torch.cuda.is_available():
logger.info("GPU found, attempting CUDA initialization")


try:
# Force CUDA initialization
str(torch.tensor([1.0], device="cuda"))
DeviceDetector._torch_device_name = torch.cuda.get_device_name(0)
_ = str(torch.tensor([1.0], device="cuda"))
self._device_name = torch.cuda.get_device_name(0)
torch.cuda.get_device_capability(0)
torch.cuda.empty_cache()

DeviceDetector._has_torch_gpu = True
DeviceDetector._torch_device = "cuda"
self._device = "cuda"
logger.info(
"GPU and CUDA available. Device: "
f"{DeviceDetector._torch_device_name}"
"CUDA available. Device: %s",
self._device_name,
)

except Exception as e:
logger.warning(f"PyTorch Pre-Check: FAILED with error: {e}")
DeviceDetector._has_torch_gpu = False
DeviceDetector._torch_device = "cpu"
else:
logger.info("No GPU found, using CPU")
DeviceDetector._has_torch_gpu = False
DeviceDetector._torch_device = "cpu"

logger.warning(
"PyTorch CUDA initialization failed, falling back to CPU: %s",
e,
)
except ImportError:
logger.info("PyTorch not available, using CPU")
DeviceDetector._has_torch_gpu = False
DeviceDetector._torch_device = "cpu"

DeviceDetector._torch_initialized = True


def has_torch_gpu(self) -> bool:
"""Return True if PyTorch GPU is available."""
return DeviceDetector._has_torch_gpu

def get_torch_device(self) -> str:
"""Return torch device string: 'cuda:0' or 'cpu'."""
return DeviceDetector._torch_device

def get_torch_device_name(self) -> Optional[str]:
"""Return PyTorch GPU device name or None."""
return DeviceDetector._torch_device_name
def get_device(self) -> str:
"""Return device string ('cuda' or 'cpu')."""
return self._device

def get_torch_device_info(self) -> dict:
"""Return PyTorch device information."""
return {
"has_gpu": DeviceDetector._has_torch_gpu,
"device_name": DeviceDetector._torch_device_name,
"device": DeviceDetector._torch_device,
}
def get_gpu_device_name(self) -> Optional[str]:
"""Return GPU device name if available."""
return self._device_name


# Initialize singleton at module import to preload CUDA libraries if GPU available
DeviceDetector()
# Shared, process-wide instance
device_detector = DeviceDetector()

+ 2
- 3
presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py View File

@@ -7,10 +7,10 @@ from spacy.language import Language
from spacy.tokens import Doc, Span

from presidio_analyzer.nlp_engine import (
DeviceDetector,
NerModelConfiguration,
NlpArtifacts,
NlpEngine,
device_detector,
)

logger = logging.getLogger("presidio-analyzer")
@@ -56,8 +56,7 @@ class SpacyNlpEngine(NlpEngine):
logger.debug(f"Loading SpaCy models: {self.models}")

# Configure GPU if available
device_detector = DeviceDetector()
if device_detector.has_torch_gpu():
if device_detector.get_device() == "cuda":
try:
spacy.require_gpu()
logger.info("spaCy GPU configured successfully")


+ 94
- 12
presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py View File

@@ -1,6 +1,6 @@
import logging
import warnings
from typing import Dict, List, Optional, Union
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

try:
import stanza
@@ -18,9 +18,10 @@ from spacy.tokens import Doc, Token
from spacy.util import registry

from presidio_analyzer.nlp_engine import (
DeviceDetector,
NerModelConfiguration,
NlpArtifacts,
SpacyNlpEngine,
device_detector,
)

logger = logging.getLogger("presidio-analyzer")
@@ -52,21 +53,13 @@ class StanzaNlpEngine(SpacyNlpEngine):
):
super().__init__(models, ner_model_configuration)
self.download_if_missing = download_if_missing
self.use_gpu = device_detector.get_device() == "cuda"

def load(self) -> None:
"""Load the NLP model."""

logger.debug(f"Loading Stanza models: {self.models}")

# Detect GPU availability
device_detector = DeviceDetector()
use_gpu = device_detector.has_torch_gpu()

if use_gpu:
logger.info("Stanza will use GPU")
else:
logger.info("Stanza will use CPU")

self.nlp = {}
for model in self.models:
self._validate_model_params(model)
@@ -76,9 +69,82 @@ class StanzaNlpEngine(SpacyNlpEngine):
download_method="DOWNLOAD_RESOURCES"
if self.download_if_missing
else None,
use_gpu=use_gpu,
use_gpu=self.use_gpu,
)

def process_batch(
    self,
    texts: Union[List[str], List[Tuple[str, object]]],
    language: str,
    batch_size: int = 1,
    n_process: int = 1,
    as_tuples: bool = False,
) -> Generator[
    Union[Tuple[Any, NlpArtifacts, Any], Tuple[Any, NlpArtifacts]], Any, None
]:
    """Execute the NLP pipeline on a batch of texts using Stanza's bulk processing.

    This method overrides SpacyNlpEngine.process_batch to leverage Stanza's
    efficient bulk_process method, which processes multiple documents together
    for better GPU utilization.

    Note: Stanza batches internally at the sentence/token level, not docs.
    For optimal GPU performance, use larger batch sizes (e.g., 16-32 docs).
    GPU utilization depends on total sentences/tokens across all docs in batch.

    :param texts: A list of texts to process. If as_tuples is set to True,
        texts should be a list of tuples (text, context).
    :param language: The language of the texts.
    :param batch_size: Number of documents per bulk_process call. Must be >= 1.
        Recommended: 16-32+ for GPU, lower values acceptable for CPU.
    :param n_process: Not used for Stanza (kept for API compatibility).
    :param as_tuples: If set to True, inputs should be a sequence of
        (text, context) tuples. Output will then be a sequence of
        (text, NlpArtifacts, context) tuples. Defaults to False.

    :return: A generator of tuples (text, NlpArtifacts, context) or
        (text, NlpArtifacts) depending on the value of as_tuples.
    :raises ValueError: If the engine is not loaded or batch_size < 1.
    """
    if not self.nlp:
        raise ValueError("NLP engine is not loaded. Consider calling .load()")

    # Guard against invalid batch sizes: previously a batch_size of 0 raised
    # a cryptic error from range(), and a negative value silently produced
    # an empty generator, dropping every input text.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Get the StanzaTokenizer (which wraps the Stanza pipeline).
    # In spaCy, tokenizers are accessed via .tokenizer, not .get_pipe().
    stanza_tokenizer = self.nlp[language].tokenizer
    stanza_pipeline = stanza_tokenizer.snlp

    # Materialize the input once so it can be sliced into batches.
    text_list = list(texts) if not isinstance(texts, list) else texts

    for batch_start in range(0, len(text_list), batch_size):
        # Slicing clamps automatically at the end of the list.
        batch = text_list[batch_start:batch_start + batch_size]

        # Split out texts and (optionally) their caller-supplied contexts.
        if as_tuples:
            batch_texts = [str(text) for text, _context in batch]
            contexts = [context for _text, context in batch]
        else:
            batch_texts = [str(text) for text in batch]
            contexts = None

        # Create Stanza Document objects and process via bulk_process;
        # Stanza handles internal batching at the sentence/token level.
        stanza_docs = [stanza.Document([], text=text) for text in batch_texts]
        processed_stanza_docs = stanza_pipeline.bulk_process(stanza_docs)

        # Convert processed Stanza docs to spaCy docs using spacy-stanza's
        # conversion path (StanzaTokenizer._convert_doc), then wrap them in
        # NlpArtifacts for downstream recognizers.
        for idx, processed_stanza_doc in enumerate(processed_stanza_docs):
            spacy_doc = stanza_tokenizer._convert_doc(processed_stanza_doc)
            nlp_artifacts = self._doc_to_nlp_artifact(spacy_doc, language)

            if as_tuples:
                yield batch_texts[idx], nlp_artifacts, contexts[idx]
            else:
                yield batch_texts[idx], nlp_artifacts


# Code taken from https://github.com/explosion/spacy-stanza
# Supports Stanza > 1.7.0
@@ -226,6 +292,22 @@ class StanzaTokenizer(object):
return Doc(self.vocab, words=[text], spaces=[False])

snlp_doc = self.snlp(text)
return self._convert_doc(snlp_doc)

def _convert_doc(self, snlp_doc):
"""Convert a processed Stanza Document to a spaCy Doc.

This method contains the conversion logic separated from text processing,
allowing it to be called with already-processed Stanza documents.

:param snlp_doc: Processed Stanza Document
:return: spaCy Doc object
"""
if not snlp_doc.text:
return Doc(self.vocab)
elif snlp_doc.text.isspace():
return Doc(self.vocab, words=[snlp_doc.text], spaces=[False])

text = snlp_doc.text
snlp_tokens, snlp_heads = self.__get_tokens_with_heads(snlp_doc)
pos = []


+ 0
- 10
presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py View File

@@ -12,7 +12,6 @@ except ImportError:
transformers = None

from presidio_analyzer.nlp_engine import (
DeviceDetector,
NerModelConfiguration,
SpacyNlpEngine,
)
@@ -76,15 +75,6 @@ class TransformersNlpEngine(SpacyNlpEngine):

logger.debug(f"Loading SpaCy and transformers models: {self.models}")

# Configure GPU if available
device_detector = DeviceDetector()
if device_detector.has_torch_gpu():
try:
spacy.require_gpu()
logger.info("spaCy GPU configured successfully")
except Exception as e:
logger.warning(f"Failed to configure spaCy for GPU: {e}")

self.nlp = {}

for model in self.models:


+ 5
- 3
presidio-analyzer/presidio_analyzer/predefined_recognizers/ner/gliner_recognizer.py View File

@@ -8,9 +8,9 @@ from presidio_analyzer import (
RecognizerResult,
)
from presidio_analyzer.nlp_engine import (
DeviceDetector,
NerModelConfiguration,
NlpArtifacts,
device_detector,
)

try:
@@ -92,7 +92,7 @@ class GLiNERRecognizer(LocalRecognizer):
self.map_location = (
map_location
if map_location is not None
else DeviceDetector().get_torch_device()
else device_detector.get_device()
)

self.flat_ner = flat_ner
@@ -117,7 +117,9 @@ class GLiNERRecognizer(LocalRecognizer):
raise ImportError("GLiNER is not installed. Please install it.")

logger.info(f"Loading GLiNER model on device: {self.map_location}")
self.gliner = GLiNER.from_pretrained(self.model_name).to(self.map_location)
self.gliner = GLiNER.from_pretrained(
self.model_name, map_location=self.map_location
)

def analyze(
self,


+ 21
- 39
presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/azure_openai_langextract_recognizer.py View File

@@ -112,7 +112,13 @@ class AzureOpenAILangExtractRecognizer(LangExtractRecognizer):
super().__init__(
config_path=actual_config_path,
name="Azure OpenAI LangExtract PII",
supported_language=supported_language
supported_language=supported_language,
extract_params={
"extract": {
"fence_output": True,
"use_schema_constraints": False,
},
}
)

# Override model_id if provided as parameter (deployment name)
@@ -133,44 +139,20 @@ class AzureOpenAILangExtractRecognizer(LangExtractRecognizer):
f"See {AZURE_OPENAI_DOCS_URL} for details."
)

def _call_langextract(self, **kwargs):
"""
Call Azure OpenAI through LangExtract for PII extraction.

Uses LangExtract's provider registry system to instantiate the custom
Azure OpenAI provider. The model_id with 'azure:' prefix triggers the
provider registration.
"""
try:

model_id_with_prefix = f"azure:{self.model_id}"
def _get_provider_params(self):
"""Return Azure OpenAI-specific params."""
model_id_with_prefix = f"azure:{self.model_id}"

language_model_params = {
"azure_endpoint": self.azure_endpoint,
"api_version": self.api_version,
"azure_deployment": self.model_id,
}

if self.api_key:
language_model_params["api_key"] = self.api_key

extract_params = {
"text_or_documents": kwargs.pop("text"),
"prompt_description": kwargs.pop("prompt"),
"examples": kwargs.pop("examples"),
"model_id": model_id_with_prefix,
"language_model_params": language_model_params,
"fence_output": True,
"use_schema_constraints": False,
}
language_model_params = {
"azure_endpoint": self.azure_endpoint,
"api_version": self.api_version,
"azure_deployment": self.model_id,
}

extract_params.update(kwargs)
if self.api_key:
language_model_params["api_key"] = self.api_key

return lx.extract(**extract_params)

except Exception:
logger.exception(
"LangExtract extraction failed (Azure OpenAI at %s, model '%s')",
self.azure_endpoint, self.model_id
)
raise
return {
"model_id": model_id_with_prefix,
"language_model_params": language_model_params,
}

+ 54
- 4
presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/langextract_recognizer.py View File

@@ -1,6 +1,6 @@
import logging
from abc import ABC, abstractmethod
from typing import List
from typing import Any, Dict, List, Optional

from presidio_analyzer.llm_utils import (
check_langextract_available,
@@ -12,6 +12,7 @@ from presidio_analyzer.llm_utils import (
load_prompt_file,
load_yaml_examples,
load_yaml_file,
lx,
render_jinja_template,
validate_config_fields,
)
@@ -31,13 +32,16 @@ class LangExtractRecognizer(LMRecognizer, ABC):
self,
config_path: str,
name: str = "LangExtract LLM PII",
supported_language: str = "en"
supported_language: str = "en",
extract_params: Optional[Dict[str, Any]] = None,
):
"""Initialize LangExtract recognizer.

:param config_path: Path to configuration file.
:param name: Name of the recognizer (provided by subclass).
:param supported_language: Language this recognizer supports (default: "en").
:param extract_params: Dict with 'extract' and/or 'language_model'
keys containing param defaults.
"""
check_langextract_available()

@@ -102,6 +106,26 @@ class LangExtractRecognizer(LMRecognizer, ABC):

self.entity_mappings = langextract_config["entity_mappings"]
self.debug = langextract_config.get("debug", False)
self._model_config = model_config

# Process extract params with config override
self._extract_params = {}
self._language_model_params = {}

if extract_params:
if "extract" in extract_params:
for param_name, default_value in extract_params["extract"].items():
self._extract_params[param_name] = self._model_config.get(
param_name, default_value
)

if "language_model" in extract_params:
for param_name, default_value in (
extract_params["language_model"].items()
):
self._language_model_params[param_name] = (
self._model_config.get(param_name, default_value)
)

def _call_llm(self, text: str, entities: List[str], **kwargs):
"""Call LangExtract LLM."""
@@ -130,7 +154,33 @@ class LangExtractRecognizer(LMRecognizer, ABC):
recognizer_name=self.__class__.__name__
)

@abstractmethod
def _call_langextract(self, **kwargs):
"""Call provider-specific LangExtract implementation."""
"""Call LangExtract with configured parameters."""
try:
extract_params = {
"text_or_documents": kwargs.pop("text"),
"prompt_description": kwargs.pop("prompt"),
"examples": kwargs.pop("examples"),
}

extract_params.update(self._get_provider_params())
extract_params.update(self._extract_params)
if self._language_model_params:
extract_params["language_model_params"] = self._language_model_params
extract_params.update(kwargs)

return lx.extract(**extract_params)
except Exception:
logger.exception(
"LangExtract extraction failed (model '%s')",
self.model_id
)
raise

@abstractmethod
def _get_provider_params(self) -> Dict[str, Any]:
    """Return provider-specific params.

    Implementations return the keyword arguments that identify and configure
    the backing provider — e.g. ``model_id`` and ``model_url`` for Ollama, or
    a prefixed ``model_id`` plus ``language_model_params`` (endpoint,
    deployment, credentials) for Azure OpenAI. The returned dict is merged
    into the arguments passed to ``lx.extract`` by ``_call_langextract``.

    Examples: model_id, model_url, azure_endpoint, etc.
    """
    ...

+ 22
- 36
presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/ollama_langextract_recognizer.py View File

@@ -1,13 +1,13 @@
import logging
from pathlib import Path
from typing import Optional
from typing import Any, Dict, Optional

from presidio_analyzer.llm_utils import lx
from presidio_analyzer.predefined_recognizers.third_party.\
langextract_recognizer import LangExtractRecognizer

logger = logging.getLogger("presidio-analyzer")


class OllamaLangExtractRecognizer(LangExtractRecognizer):
"""LangExtract recognizer using Ollama backend."""

@@ -19,20 +19,9 @@ class OllamaLangExtractRecognizer(LangExtractRecognizer):
self,
config_path: Optional[str] = None,
supported_language: str = "en",
context: Optional[list] = None
context: Optional[list] = None,
):
"""Initialize Ollama LangExtract recognizer.

Note: Ollama server availability and model availability are not validated
during initialization. Any connectivity or model issues will be reported
when analyze() is first called.

:param config_path: Path to configuration file (optional).
:param supported_language: Language this recognizer supports
(optional, default: "en").
:param context: List of context words
(optional, currently not used by LLM recognizers).
"""
"""Initialize Ollama LangExtract recognizer."""
actual_config_path = (
config_path if config_path else str(self.DEFAULT_CONFIG_PATH)
)
@@ -40,7 +29,18 @@ class OllamaLangExtractRecognizer(LangExtractRecognizer):
super().__init__(
config_path=actual_config_path,
name="Ollama LangExtract PII",
supported_language=supported_language
supported_language=supported_language,
extract_params={
"extract": {
"use_schema_constraints": False,
"fence_output": False,
"max_char_buffer": 400,
},
"language_model": {
"timeout": 240,
"num_ctx": 8192,
}
}
)

model_config = self.config.get("model", {})
@@ -48,23 +48,9 @@ class OllamaLangExtractRecognizer(LangExtractRecognizer):
if not self.model_url:
raise ValueError("Ollama model configuration must contain 'model_url'")

def _call_langextract(self, **kwargs):
"""Call Ollama through LangExtract."""
try:
extract_params = {
"text_or_documents": kwargs.pop("text"),
"prompt_description": kwargs.pop("prompt"),
"examples": kwargs.pop("examples"),
"model_id": self.model_id,
"model_url": self.model_url,
}

extract_params.update(kwargs)

return lx.extract(**extract_params)
except Exception:
logger.exception(
"LangExtract extraction failed (Ollama at %s, model '%s')",
self.model_url, self.model_id
)
raise
def _get_provider_params(self) -> Dict[str, Any]:
"""Return Ollama-specific params."""
return {
"model_id": self.model_id,
"model_url": self.model_url,
}

+ 3
- 1
presidio-analyzer/pyproject.toml View File

@@ -29,10 +29,12 @@ dependencies = [
"pyyaml",
"phonenumbers (>=8.12,<10.0.0)",
"pydantic (>=2.0.0,<3.0.0)",
"cupy-cuda12x>=13.4.1",
]

[project.optional-dependencies]
gpu = [
"cupy-cuda12x>=13.4.1",
]
server = [
"flask (>=1.1)",
"gunicorn; platform_system != 'Windows'",


+ 130
- 0
presidio-analyzer/tests/test_azure_openai_langextract_recognizer.py View File

@@ -370,3 +370,133 @@ class TestAzureOpenAIProvider:
azure_endpoint="https://test.openai.azure.com/"
# No API key, so should try managed identity
)


class TestAzureOpenAILangExtractRecognizerParameterConfiguration:
    """Test parameter configuration with defaults and YAML overrides."""

    def test_when_no_config_params_then_uses_defaults(self, mock_langextract, tmp_path):
        """Test that default extract params are used when not in config."""
        import yaml
        # Minimal config: no fence_output / use_schema_constraints under
        # "model", so the recognizer should fall back to its Azure defaults.
        config = {
            "lm_recognizer": {
                "supported_entities": ["PERSON"],
            },
            "langextract": {
                "prompt_file": "presidio-analyzer/presidio_analyzer/conf/langextract_prompts/default_pii_phi_prompt.j2",
                "examples_file": "presidio-analyzer/presidio_analyzer/conf/langextract_prompts/default_pii_phi_examples.yaml",
                "entity_mappings": {"person": "PERSON"},
                "model": {
                    "model_id": "gpt-4o",
                }
            }
        }
        config_file = tmp_path / "test_config.yaml"
        with open(config_file, 'w') as f:
            yaml.dump(config, f)

        recognizer = AzureOpenAILangExtractRecognizer(
            config_path=str(config_file),
            azure_endpoint="https://test.openai.azure.com/",
            api_key="test-key"
        )
        # Verify Azure defaults are set (different from Ollama)
        assert recognizer._extract_params["fence_output"] is True
        assert recognizer._extract_params["use_schema_constraints"] is False

    def test_when_config_has_params_then_overrides_defaults(self, mock_langextract, tmp_path):
        """Test that config values override defaults."""
        import yaml
        config = {
            "lm_recognizer": {
                "supported_entities": ["PERSON"],
            },
            "langextract": {
                "prompt_file": "presidio-analyzer/presidio_analyzer/conf/langextract_prompts/default_pii_phi_prompt.j2",
                "examples_file": "presidio-analyzer/presidio_analyzer/conf/langextract_prompts/default_pii_phi_examples.yaml",
                "entity_mappings": {"person": "PERSON"},
                "model": {
                    "model_id": "gpt-4o",
                    "fence_output": False,  # Override default
                    "use_schema_constraints": True,  # Override default
                }
            }
        }
        config_file = tmp_path / "test_config.yaml"
        with open(config_file, 'w') as f:
            yaml.dump(config, f)

        recognizer = AzureOpenAILangExtractRecognizer(
            config_path=str(config_file),
            azure_endpoint="https://test.openai.azure.com/",
            api_key="test-key"
        )
        # Verify config values override defaults
        assert recognizer._extract_params["fence_output"] is False
        assert recognizer._extract_params["use_schema_constraints"] is True

    def test_when_analyze_called_then_params_passed_to_langextract(self, tmp_path):
        """Test that configured params are passed to langextract.extract()."""
        import yaml
        config = {
            "lm_recognizer": {
                "supported_entities": ["PERSON"],
            },
            "langextract": {
                "prompt_file": "presidio-analyzer/presidio_analyzer/conf/langextract_prompts/default_pii_phi_prompt.j2",
                "examples_file": "presidio-analyzer/presidio_analyzer/conf/langextract_prompts/default_pii_phi_examples.yaml",
                "entity_mappings": {"person": "PERSON"},
                "model": {
                    "model_id": "gpt-4o",
                    "fence_output": False,
                }
            }
        }
        config_file = tmp_path / "test_config.yaml"
        with open(config_file, 'w') as f:
            yaml.dump(config, f)

        recognizer = AzureOpenAILangExtractRecognizer(
            config_path=str(config_file),
            azure_endpoint="https://test.openai.azure.com/",
            api_key="test-key"
        )

        # Build a fake extraction result so analyze() completes without a
        # real LLM call; char_interval positions match "John Doe" in `text`.
        text = "My name is John Doe"
        mock_extraction = MagicMock()
        mock_extraction.extraction_class = "person"
        mock_extraction.extraction_text = "John Doe"
        mock_extraction.char_interval = MagicMock(start_pos=11, end_pos=19)
        mock_extraction.alignment_status = "MATCH_EXACT"
        mock_extraction.attributes = {}

        mock_result = MagicMock()
        mock_result.extractions = [mock_extraction]

        with patch('langextract.extract', return_value=mock_result) as mock_extract:
            recognizer.analyze(text)
            # Verify extract was called
            assert mock_extract.called
            call_kwargs = mock_extract.call_args[1]
            # Verify extract params were passed
            assert call_kwargs["fence_output"] is False
            assert call_kwargs["use_schema_constraints"] is False
            # Verify Azure-specific provider params
            assert call_kwargs["model_id"] == "azure:gpt-4o"
            assert "language_model_params" in call_kwargs
            assert call_kwargs["language_model_params"]["azure_endpoint"] == "https://test.openai.azure.com/"
            assert call_kwargs["language_model_params"]["azure_deployment"] == "gpt-4o"
            assert call_kwargs["language_model_params"]["api_key"] == "test-key"


+ 187
- 0
presidio-analyzer/tests/test_device_detector.py View File

@@ -0,0 +1,187 @@
"""Unit tests for DeviceDetector."""

from unittest.mock import MagicMock, patch

import pytest

from presidio_analyzer.nlp_engine.device_detector import DeviceDetector, device_detector


class TestDeviceDetectorErrorPaths:
"""Test suite for DeviceDetector error handling."""

def test_when_torch_import_fails_then_cpu_device(self):
"""Test that CPU is used when PyTorch import fails."""
with patch("builtins.__import__", side_effect=ImportError("No module named 'torch'")):
detector = DeviceDetector()
assert detector.get_device() == "cpu"
assert detector.get_gpu_device_name() is None

def test_when_cuda_not_available_then_cpu_device(self):
"""Test that CPU is used when CUDA is not available."""
mock_torch = MagicMock()
mock_torch.cuda.is_available.return_value = False
def mock_import(name, *args):
if name == "torch":
return mock_torch
return __builtins__.__import__(name, *args)
with patch("builtins.__import__", side_effect=mock_import):
detector = DeviceDetector()
assert detector.get_device() == "cpu"
assert detector.get_gpu_device_name() is None

def test_when_cuda_initialization_fails_then_fallback_to_cpu(self):
"""Test that CPU fallback occurs when CUDA initialization fails."""
mock_torch = MagicMock()
mock_torch.cuda.is_available.return_value = True
mock_torch.tensor.side_effect = RuntimeError("CUDA initialization error")
def mock_import(name, *args):
if name == "torch":
return mock_torch
return __builtins__.__import__(name, *args)
with patch("builtins.__import__", side_effect=mock_import):
detector = DeviceDetector()
assert detector.get_device() == "cpu"
assert detector.get_gpu_device_name() is None

def test_when_cuda_get_device_name_fails_then_fallback_to_cpu(self):
"""Test fallback when get_device_name fails."""
mock_torch = MagicMock()
mock_torch.cuda.is_available.return_value = True
mock_torch.tensor.return_value = MagicMock(__str__=lambda x: "tensor")
mock_torch.cuda.get_device_name.side_effect = RuntimeError("Device name error")
def mock_import(name, *args):
if name == "torch":
return mock_torch
return __builtins__.__import__(name, *args)
with patch("builtins.__import__", side_effect=mock_import):
detector = DeviceDetector()
assert detector.get_device() == "cpu"

def test_when_cuda_available_then_cuda_device(self):
"""Test successful CUDA detection."""
mock_torch = MagicMock()
mock_torch.cuda.is_available.return_value = True
mock_torch.tensor.return_value = MagicMock(__str__=lambda x: "tensor")
mock_torch.cuda.get_device_name.return_value = "Test GPU"
mock_torch.cuda.get_device_capability.return_value = (8, 0)
def mock_import(name, *args):
if name == "torch":
return mock_torch
return __builtins__.__import__(name, *args)
with patch("builtins.__import__", side_effect=mock_import):
detector = DeviceDetector()
assert detector.get_device() == "cuda"
assert detector.get_gpu_device_name() == "Test GPU"


class TestDeviceDetector:
    """Test suite for DeviceDetector functionality."""

    def test_when_get_device_then_returns_string(self):
        """Test that get_device() returns a valid device string."""
        result = DeviceDetector().get_device()
        assert isinstance(result, str)
        assert result in ("cpu", "cuda")

    def test_when_get_gpu_device_name_then_returns_optional_string(self):
        """Test that get_gpu_device_name() returns None or string."""
        name = DeviceDetector().get_gpu_device_name()
        # None (no GPU) and str (GPU present) are the only valid results.
        assert name is None or isinstance(name, str)

    def test_when_multiple_instances_then_same_values(self):
        """Test that multiple DeviceDetector instances have consistent values."""
        first, second = DeviceDetector(), DeviceDetector()
        # Both instances must agree on the detected device and its name.
        assert first.get_device() == second.get_device()
        assert first.get_gpu_device_name() == second.get_gpu_device_name()


class TestDeviceDetectorIntegration:
    """Integration tests for DeviceDetector usage in NLP engines."""

    def test_when_spacy_engine_loads_then_uses_device_detector(self):
        """Test that SpacyNlpEngine uses device_detector."""
        from presidio_analyzer.nlp_engine import SpacyNlpEngine
        # NOTE(review): `engine` is never asserted on; the construction only
        # verifies that instantiation does not raise. Consider asserting on
        # an engine attribute or removing the unused variable.
        engine = SpacyNlpEngine(
            models=[{"lang_code": "en", "model_name": "en_core_web_sm"}]
        )
        # Verify device_detector is accessible
        assert device_detector.get_device() in ["cpu", "cuda"]

    def test_when_stanza_engine_initializes_then_sets_use_gpu(self):
        """Test that StanzaNlpEngine correctly sets use_gpu from device_detector."""
        from presidio_analyzer.nlp_engine import StanzaNlpEngine
        engine = StanzaNlpEngine(
            models=[{"lang_code": "en", "model_name": "en"}]
        )
        # use_gpu should match device_detector
        expected_use_gpu = device_detector.get_device() == "cuda"
        assert engine.use_gpu == expected_use_gpu

    def test_when_gliner_recognizer_initializes_then_uses_correct_device(self):
        """Test that GLiNERRecognizer uses device from device_detector."""
        # Skip entirely when the optional gliner dependency is absent.
        pytest.importorskip("gliner")
        from presidio_analyzer.predefined_recognizers import GLiNERRecognizer
        recognizer = GLiNERRecognizer()
        # map_location should match device_detector.get_device()
        assert recognizer.map_location == device_detector.get_device()

    def test_when_stanza_engine_use_gpu_matches_device_detector(self):
        """Test that StanzaNlpEngine.use_gpu matches device_detector."""
        # NOTE(review): this duplicates
        # test_when_stanza_engine_initializes_then_sets_use_gpu above —
        # consider removing one of the two.
        from presidio_analyzer.nlp_engine import StanzaNlpEngine
        engine = StanzaNlpEngine(
            models=[{"lang_code": "en", "model_name": "en"}]
        )
        expected_use_gpu = device_detector.get_device() == "cuda"
        assert engine.use_gpu == expected_use_gpu


class TestDeviceDetectorBehavior:
    """Test suite for DeviceDetector runtime behavior."""

    def test_when_creating_new_instance_then_device_consistent(self):
        """Test that new instances have consistent device detection."""
        # Two fresh detections must agree, so the set collapses to one value.
        devices = {DeviceDetector().get_device() for _ in range(2)}
        assert len(devices) == 1

    def test_when_device_is_cuda_then_has_capabilities(self):
        """Test that CUDA device has expected capabilities."""
        if device_detector.get_device() != "cuda":
            return  # only meaningful on a GPU host
        gpu_name = device_detector.get_gpu_device_name()
        assert gpu_name is not None
        assert len(gpu_name) > 0

    def test_when_device_is_cpu_then_no_gpu_name(self):
        """Test that CPU device has no GPU name."""
        if device_detector.get_device() != "cpu":
            return  # only meaningful on a CPU-only host
        assert device_detector.get_gpu_device_name() is None

+ 0
- 1
presidio-analyzer/tests/test_gliner_recognizer.py View File

@@ -16,7 +16,6 @@ def mock_gliner():

# Mock the GLiNER class and its methods
mock_gliner_instance = MagicMock()
# Make .to() return the same mock instance (for device placement)
mock_gliner_instance.to.return_value = mock_gliner_instance
# Mock the from_pretrained method to return the mock instance
with patch("gliner.GLiNER.from_pretrained", return_value=mock_gliner_instance):


+ 133
- 0
presidio-analyzer/tests/test_ollama_recognizer.py View File

@@ -430,3 +430,136 @@ class TestOllamaLangExtractRecognizerAnalyze:

# Unknown entity type should be skipped when consolidation is disabled
assert len(results) == 0


class TestOllamaLangExtractRecognizerParameterConfiguration:
    """Test parameter configuration with defaults and YAML overrides."""

    def test_when_no_config_params_then_uses_defaults(self, tmp_path):
        """Test that default extract params are used when not in config."""
        import yaml
        config = create_test_config()
        # No extract params in config - should use defaults
        config_file = tmp_path / "test_config.yaml"
        with open(config_file, 'w') as f:
            yaml.dump(config, f)

        # NOTE(review): patching `lx` with return_value=Mock() replaces the
        # module object with a MagicMock; presumably only the replacement
        # matters here, not the return_value — confirm intent.
        with patch('presidio_analyzer.llm_utils.langextract_helper.lx',
                   return_value=Mock()):
            from presidio_analyzer.predefined_recognizers.third_party.ollama_langextract_recognizer import OllamaLangExtractRecognizer
            recognizer = OllamaLangExtractRecognizer(config_path=str(config_file))
            # Verify defaults are set
            assert recognizer._extract_params["max_char_buffer"] == 400
            assert recognizer._extract_params["use_schema_constraints"] is False
            assert recognizer._extract_params["fence_output"] is False
            assert recognizer._language_model_params["timeout"] == 240
            assert recognizer._language_model_params["num_ctx"] == 8192

    def test_when_config_has_params_then_overrides_defaults(self, tmp_path):
        """Test that config values override defaults."""
        import yaml
        config = create_test_config()
        # Add custom values to override defaults
        config["langextract"]["model"]["max_char_buffer"] = 1000
        config["langextract"]["model"]["use_schema_constraints"] = True
        config["langextract"]["model"]["fence_output"] = True
        config["langextract"]["model"]["timeout"] = 120
        config["langextract"]["model"]["num_ctx"] = 4096
        config_file = tmp_path / "test_config.yaml"
        with open(config_file, 'w') as f:
            yaml.dump(config, f)

        with patch('presidio_analyzer.llm_utils.langextract_helper.lx',
                   return_value=Mock()):
            from presidio_analyzer.predefined_recognizers.third_party.ollama_langextract_recognizer import OllamaLangExtractRecognizer
            recognizer = OllamaLangExtractRecognizer(config_path=str(config_file))
            # Verify config values override defaults
            assert recognizer._extract_params["max_char_buffer"] == 1000
            assert recognizer._extract_params["use_schema_constraints"] is True
            assert recognizer._extract_params["fence_output"] is True
            assert recognizer._language_model_params["timeout"] == 120
            assert recognizer._language_model_params["num_ctx"] == 4096

    def test_when_partial_config_params_then_uses_defaults_for_missing(self, tmp_path):
        """Test that only some params can be overridden."""
        import yaml
        config = create_test_config()
        # Override only some params
        config["langextract"]["model"]["max_char_buffer"] = 500
        config["langextract"]["model"]["timeout"] = 60
        config_file = tmp_path / "test_config.yaml"
        with open(config_file, 'w') as f:
            yaml.dump(config, f)

        with patch('presidio_analyzer.llm_utils.langextract_helper.lx',
                   return_value=Mock()):
            from presidio_analyzer.predefined_recognizers.third_party.ollama_langextract_recognizer import OllamaLangExtractRecognizer
            recognizer = OllamaLangExtractRecognizer(config_path=str(config_file))
            # Verify overridden values
            assert recognizer._extract_params["max_char_buffer"] == 500
            assert recognizer._language_model_params["timeout"] == 60
            # Verify defaults for non-overridden params
            assert recognizer._extract_params["use_schema_constraints"] is False
            assert recognizer._extract_params["fence_output"] is False
            assert recognizer._language_model_params["num_ctx"] == 8192

    def test_when_analyze_called_then_params_passed_to_langextract(self, tmp_path):
        """Test that configured params are passed to langextract.extract()."""
        import yaml
        config = create_test_config()
        config["langextract"]["model"]["max_char_buffer"] = 1500
        config["langextract"]["model"]["timeout"] = 180
        config_file = tmp_path / "test_config.yaml"
        with open(config_file, 'w') as f:
            yaml.dump(config, f)

        with patch('presidio_analyzer.llm_utils.langextract_helper.lx',
                   return_value=Mock()):
            from presidio_analyzer.predefined_recognizers.third_party.ollama_langextract_recognizer import OllamaLangExtractRecognizer
            recognizer = OllamaLangExtractRecognizer(config_path=str(config_file))

            # Fake extraction result so analyze() completes without a real
            # LLM call; char_interval positions match "John Doe" in `text`.
            text = "My name is John Doe"
            mock_extraction = Mock()
            mock_extraction.extraction_class = "person"
            mock_extraction.extraction_text = "John Doe"
            mock_extraction.char_interval = Mock(start_pos=11, end_pos=19)
            mock_extraction.alignment_status = "MATCH_EXACT"
            mock_extraction.attributes = {}

            mock_result = Mock()
            mock_result.extractions = [mock_extraction]

            with patch('langextract.extract', return_value=mock_result) as mock_extract:
                recognizer.analyze(text)
                # Verify extract was called
                assert mock_extract.called
                call_kwargs = mock_extract.call_args[1]
                # Verify extract params were passed
                assert call_kwargs["max_char_buffer"] == 1500
                assert call_kwargs["use_schema_constraints"] is False
                assert call_kwargs["fence_output"] is False
                # Verify language model params were passed
                assert "language_model_params" in call_kwargs
                assert call_kwargs["language_model_params"]["timeout"] == 180
                assert call_kwargs["language_model_params"]["num_ctx"] == 8192
                # Verify provider params
                assert call_kwargs["model_id"] == "qwen2.5:1.5b"
                assert call_kwargs["model_url"] == "http://localhost:11434"


+ 49
- 1
presidio-analyzer/tests/test_spacy_nlp_engine.py View File

@@ -1,5 +1,6 @@
import json
from typing import Iterator
from unittest.mock import MagicMock, patch

import pytest

@@ -102,4 +103,51 @@ def test_batch_processing_with_as_tuples_returns_context(spacy_nlp_engine, texts
else:
for text, nlp_artifacts in nlp_artifacts_batch:
assert text == "simple text"
assert len(nlp_artifacts.tokens) == 2
assert len(nlp_artifacts.tokens) == 2


def test_when_gpu_available_then_spacy_gpu_configured():
    """Test that spaCy GPU is configured when GPU is detected."""
    detector_patch = patch(
        "presidio_analyzer.nlp_engine.spacy_nlp_engine.device_detector"
    )
    spacy_patch = patch("presidio_analyzer.nlp_engine.spacy_nlp_engine.spacy")
    with detector_patch as fake_detector, spacy_patch as fake_spacy:
        # Simulate a CUDA host with a loadable model package.
        fake_detector.get_device.return_value = "cuda"
        fake_spacy.load.return_value = MagicMock()
        fake_spacy.util.is_package.return_value = True
        nlp_engine = SpacyNlpEngine(
            models=[{"lang_code": "en", "model_name": "en_core_web_sm"}]
        )
        nlp_engine.load()
        fake_spacy.require_gpu.assert_called_once()


def test_when_gpu_configuration_fails_then_warning_logged():
    """Test that warning is logged when GPU configuration fails."""
    with patch(
        "presidio_analyzer.nlp_engine.spacy_nlp_engine.device_detector"
    ) as fake_detector, patch(
        "presidio_analyzer.nlp_engine.spacy_nlp_engine.spacy"
    ) as fake_spacy, patch(
        "presidio_analyzer.nlp_engine.spacy_nlp_engine.logger"
    ) as fake_logger:
        # CUDA is detected, but require_gpu blows up during load().
        fake_detector.get_device.return_value = "cuda"
        fake_spacy.load.return_value = MagicMock()
        fake_spacy.util.is_package.return_value = True
        fake_spacy.require_gpu.side_effect = Exception("GPU error")
        nlp_engine = SpacyNlpEngine(
            models=[{"lang_code": "en", "model_name": "en_core_web_sm"}]
        )
        nlp_engine.load()
        # The failure must be logged, not raised.
        assert fake_logger.warning.called


def test_when_cpu_device_then_gpu_not_configured():
    """Verify spaCy stays on CPU when the detector reports a CPU device."""
    target = "presidio_analyzer.nlp_engine.spacy_nlp_engine"
    with patch(f"{target}.device_detector") as fake_detector, \
            patch(f"{target}.spacy") as fake_spacy:
        fake_detector.get_device.return_value = "cpu"
        fake_spacy.load.return_value = MagicMock()
        fake_spacy.util.is_package.return_value = True
        engine = SpacyNlpEngine(
            models=[{"lang_code": "en", "model_name": "en_core_web_sm"}]
        )
        engine.load()
        # No GPU detected, so require_gpu must never be invoked.
        fake_spacy.require_gpu.assert_not_called()

+ 339
- 0
presidio-analyzer/tests/test_stanza_batch_processing.py View File

@@ -0,0 +1,339 @@
"""Unit tests for StanzaNlpEngine.process_batch() and bulk_process integration."""

from typing import Iterator
from unittest.mock import Mock, MagicMock, patch

import pytest

from presidio_analyzer.nlp_engine import NlpArtifacts


@pytest.fixture(scope="module")
def stanza_nlp_engine(nlp_engines):
    """Return the loaded StanzaNlpEngine, or None when it is unavailable."""
    engine = nlp_engines.get("stanza_en", None)
    if not engine:
        # Tests carrying the skip_engine("stanza_en") marker handle this.
        return None
    engine.load()
    return engine


@pytest.mark.skip_engine("stanza_en")
class TestStanzaBatchProcessing:
    """Test suite for Stanza batch processing functionality."""

    def test_when_process_batch_with_strings_then_returns_iterator(
        self, stanza_nlp_engine
    ):
        """Batch processing of plain strings yields (text, artifacts) pairs."""
        inputs = ["Hello world", "This is a test"]
        batch = stanza_nlp_engine.process_batch(inputs, language="en", batch_size=2)
        assert isinstance(batch, Iterator)
        outputs = list(batch)
        assert len(outputs) == 2
        for doc_text, artifacts in outputs:
            assert isinstance(doc_text, str)
            assert isinstance(artifacts, NlpArtifacts)
            assert len(artifacts.tokens) > 0

    def test_when_process_batch_with_tuples_then_returns_context(
        self, stanza_nlp_engine
    ):
        """With as_tuples=True, each result carries its input context through."""
        inputs = [
            ("Hello world", {"id": 1}),
            ("This is a test", {"id": 2}),
        ]
        outputs = list(
            stanza_nlp_engine.process_batch(
                inputs, language="en", batch_size=2, as_tuples=True
            )
        )
        assert len(outputs) == 2
        for (doc_text, artifacts, ctx), (want_text, want_ctx) in zip(inputs and outputs, inputs):
            assert doc_text == want_text
            assert isinstance(artifacts, NlpArtifacts)
            assert ctx == want_ctx

    def test_when_process_batch_with_entities_then_extracts_correctly(
        self, stanza_nlp_engine
    ):
        """Entities are extracted for each text in the batch."""
        inputs = [
            "Barack Obama was born in Hawaii.",
            "John Smith lives in New York.",
        ]
        outputs = list(
            stanza_nlp_engine.process_batch(inputs, language="en", batch_size=2)
        )
        # Each sentence mentions a person and a location, so expect >= 2 each.
        for _, artifacts in outputs:
            assert len(artifacts.entities) >= 2

    def test_when_process_batch_with_different_batch_sizes_then_works(
        self, stanza_nlp_engine
    ):
        """All inputs are processed regardless of the chosen batch size."""
        inputs = ["Text one", "Text two", "Text three", "Text four", "Text five"]
        for size in (1, 2, 3, 10):
            outputs = list(
                stanza_nlp_engine.process_batch(
                    inputs, language="en", batch_size=size
                )
            )
            assert len(outputs) == 5
            assert all(
                isinstance(artifacts, NlpArtifacts) for _, artifacts in outputs
            )

    def test_when_process_batch_with_empty_list_then_returns_empty(
        self, stanza_nlp_engine
    ):
        """An empty input list produces an empty result."""
        outputs = list(stanza_nlp_engine.process_batch([], language="en"))
        assert len(outputs) == 0

    def test_when_process_batch_not_loaded_then_raises_error(self):
        """Calling process_batch before load() raises a ValueError."""
        from presidio_analyzer.nlp_engine import StanzaNlpEngine

        engine = StanzaNlpEngine(models=[{"lang_code": "en", "model_name": "en"}])
        # load() is deliberately never called here.
        with pytest.raises(ValueError, match="NLP engine is not loaded"):
            list(engine.process_batch(["test"], language="en"))

    def test_when_process_batch_with_whitespace_then_handles_correctly(
        self, stanza_nlp_engine
    ):
        """Texts with leading/trailing whitespace are still tokenized."""
        inputs = [
            " Leading whitespace",
            "Trailing whitespace ",
            " Multiple spaces ",
        ]
        outputs = list(
            stanza_nlp_engine.process_batch(inputs, language="en", batch_size=3)
        )
        assert len(outputs) == 3
        for _, artifacts in outputs:
            assert isinstance(artifacts, NlpArtifacts)
            # Tokens must be produced despite the surrounding whitespace.
            assert len(artifacts.tokens) > 0

    def test_when_process_batch_preserves_text_order(self, stanza_nlp_engine):
        """Results come back in the same order as the inputs."""
        inputs = [f"Text number {i}" for i in range(10)]
        outputs = stanza_nlp_engine.process_batch(
            inputs, language="en", batch_size=3
        )
        for expected, (doc_text, _) in zip(inputs, outputs):
            assert doc_text == expected

    def test_when_process_batch_with_special_chars_then_works(
        self, stanza_nlp_engine
    ):
        """Texts containing emails, phone numbers, and URLs are handled."""
        inputs = [
            "Email: test@example.com",
            "Phone: +1-555-1234",
            "URL: https://example.com",
        ]
        outputs = list(
            stanza_nlp_engine.process_batch(inputs, language="en", batch_size=3)
        )
        assert len(outputs) == 3
        assert all(
            isinstance(artifacts, NlpArtifacts) for _, artifacts in outputs
        )


@pytest.mark.skip_engine("stanza_en")
class TestStanzaTokenizerConvertDoc:
    """Test suite for StanzaTokenizer._convert_doc() method."""

    def test_when_convert_doc_called_then_returns_spacy_doc(
        self, stanza_nlp_engine
    ):
        """_convert_doc() turns a processed Stanza doc into a spaCy doc."""
        import stanza

        tokenizer = stanza_nlp_engine.nlp["en"].tokenizer
        text = "Barack Obama was born in Hawaii."
        processed = tokenizer.snlp(stanza.Document([], text=text))
        converted = tokenizer._convert_doc(processed)
        assert converted.text == text
        assert len(converted) > 0  # tokens exist
        # Note: Sentence boundaries require the full pipeline
        assert any(tok.is_sent_start for tok in converted)
        assert len(converted.ents) > 0  # entities carried over

    def test_when_convert_doc_with_empty_text_then_returns_empty_doc(
        self, stanza_nlp_engine
    ):
        """_convert_doc() on an empty document yields an empty spaCy Doc."""
        import stanza
        from spacy.tokens import Doc

        tokenizer = stanza_nlp_engine.nlp["en"].tokenizer
        converted = tokenizer._convert_doc(stanza.Document([], text=""))
        assert isinstance(converted, Doc)
        assert len(converted) == 0

    def test_when_convert_doc_with_whitespace_only_then_handles_correctly(
        self, stanza_nlp_engine
    ):
        """_convert_doc() does not fail on whitespace-only input."""
        import stanza
        from spacy.tokens import Doc

        tokenizer = stanza_nlp_engine.nlp["en"].tokenizer
        converted = tokenizer._convert_doc(stanza.Document([], text="   "))
        # Whitespace-only input should convert without raising.
        assert isinstance(converted, Doc)

    def test_when_convert_doc_preserves_linguistic_features(
        self, stanza_nlp_engine
    ):
        """POS tags, lemmas, and dependency heads survive the conversion."""
        import stanza

        tokenizer = stanza_nlp_engine.nlp["en"].tokenizer
        text = "The quick brown fox jumps."
        processed = tokenizer.snlp(stanza.Document([], text=text))
        converted = tokenizer._convert_doc(processed)
        for tok in converted:
            assert tok.pos_ is not None  # POS tag present
            assert tok.lemma_ is not None  # lemma present
            if tok.dep_:
                assert tok.head is not None  # dependency head present


@pytest.mark.skip_engine("stanza_en")
class TestStanzaBulkProcessIntegration:
    """Integration tests for Stanza's bulk_process usage."""

    @patch("stanza.Pipeline.bulk_process")
    def test_when_process_batch_then_calls_bulk_process(
        self, mock_bulk_process, stanza_nlp_engine
    ):
        """Test that process_batch() calls Stanza's bulk_process method."""
        import stanza
        # Setup mock to return processed docs
        mock_bulk_process.return_value = [
            Mock(text="Text 1", sentences=[], entities=[]),
            Mock(text="Text 2", sentences=[], entities=[])
        ]
        # Create mock for the conversion.  The module-scoped fixture shares
        # this tokenizer across tests, so the original method is kept and
        # restored in the finally block below.
        stanza_tokenizer = stanza_nlp_engine.nlp["en"].tokenizer
        original_convert = stanza_tokenizer._convert_doc
        def mock_convert(doc):
            # Return a minimal spaCy doc
            from spacy.tokens import Doc
            return Doc(stanza_tokenizer.vocab, words=["test"])
        stanza_tokenizer._convert_doc = mock_convert
        try:
            texts = ["Text 1", "Text 2"]
            result = list(stanza_nlp_engine.process_batch(
                texts, language="en", batch_size=2
            ))
            # Verify bulk_process was called
            assert mock_bulk_process.called
            # Verify the input to bulk_process
            call_args = mock_bulk_process.call_args[0][0]
            assert len(call_args) == 2
            assert all(isinstance(doc, stanza.Document) for doc in call_args)
        finally:
            # Restore original method
            stanza_tokenizer._convert_doc = original_convert

    def test_when_process_batch_with_large_batch_then_handles_correctly(
        self, stanza_nlp_engine
    ):
        """Test batch processing with a large number of texts."""
        num_texts = 100
        texts = [f"This is test text number {i}." for i in range(num_texts)]
        result = stanza_nlp_engine.process_batch(
            texts, language="en", batch_size=16
        )
        result_list = list(result)
        assert len(result_list) == num_texts
        # Verify all texts were processed (in input order)
        for i, (text, nlp_artifacts) in enumerate(result_list):
            assert f"number {i}" in text
            assert isinstance(nlp_artifacts, NlpArtifacts)

    def test_when_process_batch_batching_matches_batch_size(
        self, stanza_nlp_engine
    ):
        """Test that internal batching respects batch_size parameter."""
        texts = [f"Text {i}" for i in range(10)]
        # Process with different batch sizes
        for batch_size in [1, 3, 5, 10]:
            result = stanza_nlp_engine.process_batch(
                texts, language="en", batch_size=batch_size
            )
            result_list = list(result)
            # Should process all texts regardless of batch size
            assert len(result_list) == 10

Loading…
Cancel
Save
Baidu
map