#!/usr/bin/perl
use strict;
use warnings;

our $VERSION = '1.0';

use Text::CSV;
use Lingua::DxExtractor;
use Cwd;

# Reader used for both config.txt and the report files.
my $csv = Text::CSV->new( { binary => 1 } )
    or die "Cannot use CSV: " . Text::CSV->error_diag();

# Separate writer so that every output field is quoted and embedded quote
# characters are escaped; hand-built rows break as soon as a report contains
# a double quote.
my $csv_out = Text::CSV->new( { binary => 1, always_quote => 1, eol => "\n" } )
    or die "Cannot use CSV: " . Text::CSV->error_diag();

my $dir = getcwd;

my $config = load_config("$dir/config.txt");

my $extractor = Lingua::DxExtractor->new(
    {
        target_phrases            => $config->{target_phrases},
        skip_phrases              => $config->{skip_phrases},
        absolute_present_phrases  => $config->{absolute_positive_assertions},
        absolute_negative_phrases => $config->{absolute_negative_assertions},
        start_phrase              => $config->{start_phrase},
    }
);

# Take a snapshot of the directory before writing anything, so that result
# files created during this run are never picked up as input.
opendir my $dh, $dir or die "Can't open directory $dir: $!\n";
my @entries = readdir $dh;
closedir $dh;

for my $file (@entries) {
    next unless $file =~ /\.csv\z/i;    # real .csv files only
    next if $file =~ /\Aresult_/i;      # output of this or an earlier run
    next unless -f "$dir/$file";
    process_report_file($file);
}

# load_config($path)
#
# Reads the configuration file at $path. A row whose first field starts with
# '#' switches the active section; every other non-empty row contributes its
# first field (trailing whitespace removed) to the active section. Rows in
# sections other than the five known ones are ignored.
#
# Returns a hash reference with array references under target_phrases,
# skip_phrases, absolute_negative_assertions and absolute_positive_assertions,
# and a scalar (possibly undef) under start_phrase. Dies if the file cannot
# be opened.
sub load_config {
    my ($path) = @_;

    my %config = (
        target_phrases               => [],
        skip_phrases                 => [],
        absolute_negative_assertions => [],
        absolute_positive_assertions => [],
        start_phrase                 => undef,
    );

    # Empty string rather than undef: phrases that appear before the first
    # section header are ignored without "uninitialized" warnings.
    my $active_section = '';

    open my $fh, '<', $path or die "$path: $!";
    while ( my $row = $csv->getline($fh) ) {
        my $entry = $row->[0];
        next unless defined $entry;

        # Strip before looking for a header so that "#target_phrases  "
        # (trailing blanks) still selects the section.
        $entry =~ s/\s+\z//;
        next unless length $entry;

        if ( $entry =~ /\A#(.*)\z/ ) {
            $active_section = $1;
            next;
        }

        if ( $active_section eq 'start_phrase' ) {
            $config{start_phrase} = $entry;    # last one wins
        }
        elsif ( ref $config{$active_section} eq 'ARRAY' ) {
            push @{ $config{$active_section} }, $entry;
        }
    }

    # getline returns undef on a parse error as well as at end of file.
    warn "Stopped reading $path early: " . $csv->error_diag() . "\n"
        unless $csv->eof;
    close $fh;

    return \%config;
}

# process_report_file($file)
#
# Reads "$dir/$file", treating the first row as column names and the first
# column of every later row as the report text. Writes result_<name>.csv
# (name lower-cased, extension removed) into the current directory with the
# original columns plus Outcome, Ambiguous? and Debug. Dies if either file
# cannot be opened or the output cannot be written.
sub process_report_file {
    my ($file) = @_;

    my $in_path = "$dir/$file";
    open my $in, '<', $in_path or die "$in_path: $!";

    ( my $base = lc $file ) =~ s/\.csv\z//;
    my $out_path = "result_$base.csv";
    open my $out, '>', $out_path or die "$out_path: $!";

    my $is_header = 1;
    while ( my $row = $csv->getline($in) ) {
        my @fields;
        if ($is_header) {
            $is_header = 0;
            @fields = ( @$row, 'Outcome', 'Ambiguous?', 'Debug' );
        }
        else {
            my $answer    = $extractor->process_text( $row->[0] );
            my $ambiguous = $extractor->ambiguous || 0;
            my $debug     = $extractor->debug;
            @fields = ( @$row, $answer, $ambiguous, $debug );
        }

        # Undefined values are written as empty quoted fields.
        @fields = map { defined $_ ? $_ : '' } @fields;
        $csv_out->print( $out, \@fields )
            or die "Cannot write to $out_path: " . $csv_out->error_diag();

        $extractor->reset;
    }

    warn "Stopped reading $in_path early: " . $csv->error_diag() . "\n"
        unless $csv->eof;

    # Buffered write errors only surface when the handle is closed.
    close $out or die "$out_path: $!";
    close $in;

    return;
}

=head1 NAME

simpleNLP - a script that reads through medical reports and flags the
presence or absence of a condition in each report.

=head1 DESCRIPTION

Reads a config.txt file in the same directory to define a
Lingua::DxExtractor object then reads a csv file in the same directory to
parse reports and assign outcomes using the DxExtractor object.

=head1 README

Place this file in a folder with a config.txt and a datafile in csv format
with column names in the first row and the fulltext reports that need to be
parsed in the first column. Run the script and a new csv file, named
result_<datafile>.csv, will be generated with 3 new columns added for each
row with an outcome, ambiguous flag and a debug section. Files whose names
start with result_ are treated as output of an earlier run and are not
processed again.

The config.txt file should contain a line with #target_phrases followed by
phrases (one per line). If needed sections can be added for #skip_phrases,
#absolute_negative_assertions, #absolute_positive_assertions, and
#start_phrase.

=head1 PREREQUISITES

This script requires the C<strict> and C<warnings> modules. It also requires
C<Cwd>, C<Text::CSV> and C<Lingua::DxExtractor>, which in turn requires
C<Lingua::NegEx>.

=head1 COREQUISITES

=pod OSNAMES

any

=pod SCRIPT CATEGORIES

Fun/Educational

=cut