use warnings; use strict; use Lingua::Stem; use Carp; # $data_file="D:/coding/d1.txt"; # open(DAT, $data_file) || die("Could not open file!"); # my @docs=; # close(DAT); open instream, "D:/coding/d1.txt"; $/=undef; # It is used read all text at once from the file. @docs = ; close instream ; my $engine = new( docs => \@docs, threshold => .04); #call new $engine->build_index(); #call build_index while ( my $query = <> ) { my %results = $engine->search( $query ); #call search print join "\n", keys %results; } # new sub new { my ( $class, %params ) = @_; croak 'Usage: new( docs => \@docs);' unless exists ( $params{'docs'} ) and ref( $params{'docs'} ) and ref( $params{'docs'}) eq 'ARRAY'; my $self = { docs => $params{'docs'}, threshold => $params{'threshold'} || .001, stop_list => load_stop_list(), }; return bless $self, $class; } # build index sub build_index() { my ( $self ) = @_; print "Making word list:\n"; $self->make_word_list(); my @vecs; foreach my $doc ( @{ $self->{'docs'} }) { my $vec = $self->make_vector( $doc ); push @vecs, norm $vec; } $self->{'doc_vectors'} = \@vecs; print "Finished with word list\n"; } #search sub search { my ( $self, $query ) = @_; my $qvec = $self->make_vector( $query ); my %result_list = $self->get_cosines( norm $qvec ); my %documents; foreach my $index ( keys %result_list ) { my $doc = $self->{'docs'}->[$index]; my $relevance = $result_list{$index}; $documents{$doc} = $relevance; } return %documents; } # load stop lists sub load_stop_list { my %stop_words; while () { chomp; $stop_words{$_}++; } return \%stop_words; } #make word list sub make_word_list { my ( $self ) = @_; my %all_words; foreach my $doc ( @{ $self->{docs} } ) { my %words = $self->get_words( $doc ); foreach my $k ( keys %words ) { #print "Word: $k\n"; $all_words{$k} += $words{$k}; } } # create a lookup hash of word to position my %lookup; my @sorted_words = sort keys %all_words; @lookup{@sorted_words} = (1..$#sorted_words ); $self->{'word_index'} = \%lookup; $self->{'word_list'} = \@sorted_words; $self->{'word_count'} = scalar @sorted_words; } #make vector sub make_vector { my ( $self, $doc ) = @_; my %words = $self->get_words( $doc ); my $vector = zeroes $self->{'word_count'}; foreach my $w ( keys %words ) { my $value = $words{$w}; my $offset = $self->{'word_index'}->{$w}; index( $vector, $offset ) .= $value; } return $vector; } # get cosines sub get_cosines { my ( $self, $query_vec ) = @_; my %cosines; my $index = 0; foreach my $vec ( @{ $self->{'doc_vectors'} }) { my $cosine = cosine( $vec, $query_vec ); $cosines{$index} = $cosine if $cosine > $self->{'threshold'}; $index++; } return %cosines; } # Assumes both incoming vectors are normalized sub cosine { my ( $vec1, $vec2 ) = @_; my $cos = inner( $vec1, $vec2 ); # inner product return $cos->sclr(); # converts PDL object to Perl scalar } # get words sub get_words { # Splits on whitespace and strips some punctuation my ( $self, $text ) = @_; my %doc_words; my @words = map { stem($_) } grep { !( exists $self->{'stop_list'}->{$_} ) } map { lc($_) } map { $_ =~/([a-z\-']+)/i} split /\s+/, $text; do { $_++ } for @doc_words{@words}; return %doc_words; } sub stem { my ( $word) = @_; my $stemref = Lingua::Stem::stem( $word ); return $stemref->[0]; }