#!/usr/bin/perl

## kwic.pl --- 12.6.2004

# Copyright (C) 2004 Virach Sornlertlamvanich

## Author: Virach SORNLERTLAMVANICH
## Keywords: KWIC, Thai text

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2, or (at
# your option) any later version.

# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

## Commentary: KWIC for both segmented or unsegmented Thai text

## Usage: kwic.pl [-s] textfile [< keyfile]
# Option:
#   default
#       plain input text and 'keyword;span' is acquired from
#       direct input (50 columns left-right of the keyword if
#       no span defined)
#   -s  input text is a segmented text, having a space as the
#       word boundary
#   keyfile
#       list of 'keyword;span'
# Input ends at end-of-file or at the first line with an empty keyword.
# Environment:
#   KWIC_ENCODING
#       character encoding of textfile, keyfile and the output
#       (default UTF-8; e.g. TIS-620 for legacy data)

## Code:

use strict;
use warnings;

# Context width, in display columns, used when a request gives no usable span.
my $DEFAULT_SPAN = 50;

# Thai characters that are not counted as a display column when measuring
# context: U+0E31 (MAI HAN-AKAT), U+0E34..U+0E3A (SARA I .. PHINTHU) and
# U+0E47..U+0E4D (MAITAIKHU .. NIKHAHIT).  Written as code points so the
# class does not depend on the encoding of this source file.
my $nonspace = qr/[\x{0E31}\x{0E34}-\x{0E3A}\x{0E47}-\x{0E4D}]/;

# The next two classes are carried over from the original script, which
# defined them but never referenced them.
# NOTE(review): presumably characters that cannot begin / end a word;
# confirm before relying on them.
my $nonbegin = qr/[\x{0E2F}\x{0E30}\x{0E32}\x{0E33}\x{0E46}]/;    # U+0E2F U+0E30 U+0E32 U+0E33 U+0E46
my $nonend   = qr/[\x{0E40}-\x{0E44}]/;                            # U+0E40 .. U+0E44

# Characters accepted next to a keyword in segmented (-s) mode.  The literal
# '^' and '$' are kept because the original class contained them; the start
# and end of a line are handled separately in is_boundary().
my $boundary = qr/[ ^\$\t\n]/;

# Return the context width to use for a raw span field: its leading
# non-negative integer when that is greater than zero, else the default.
sub parse_span {
    my ($raw) = @_;
    return $DEFAULT_SPAN unless defined $raw && $raw =~ /\A\s*(\d+)/;
    return $1 > 0 ? $1 + 0 : $DEFAULT_SPAN;
}

# True when index $idx of @$chars lies outside the line (start or end of
# line) or holds a boundary character.
sub is_boundary {
    my ($chars, $idx) = @_;
    return 1 if $idx < 0 || $idx > $#{$chars};
    return $chars->[$idx] =~ $boundary ? 1 : 0;
}

# Build the context to the left of index $start: walk leftwards until $span
# display columns are collected, then left-pad with spaces to exactly $span.
sub left_context {
    my ($chars, $start, $span) = @_;
    my $context = '';
    my $columns = 0;
    for (my $i = $start - 1; $i >= 0 && $columns < $span; $i--) {
        $context = $chars->[$i] . $context;
        $columns++ if $chars->[$i] !~ $nonspace;
    }
    return (' ' x ($span - $columns)) . $context;
}

# Build the context starting at index $start: walk rightwards until $span
# display columns are collected, then right-pad with spaces to exactly $span.
sub right_context {
    my ($chars, $start, $span) = @_;
    my $context = '';
    my $columns = 0;
    my $i       = $start;
    for (; $i < @{$chars} && $columns < $span; $i++) {
        $context .= $chars->[$i];
        $columns++ if $chars->[$i] !~ $nonspace;
    }

    # Zero-width marks that follow the last counted character belong to it;
    # keep them so the final character is not shown stripped of its marks.
    while ($i < @{$chars} && $chars->[$i] =~ $nonspace) {
        $context .= $chars->[$i++];
    }
    return $context . (' ' x ($span - $columns));
}

# Find every non-overlapping occurrence of $key in $text_line and return a
# list of [ left_context, right_context ] pairs, one per reported occurrence.
# With $segmented true, an occurrence is reported only when it is delimited
# on both sides by a boundary character or a line edge.
sub kwic_entries {
    my ($text_line, $key, $span, $segmented) = @_;

    my $key_length = length $key;
    return if $key_length == 0;                     # index() would never advance
    return if index($text_line, $key) < 0;          # skip the split when no hit

    my @chars = split //, $text_line;               # once per line, not per match
    my @entries;
    my $pos = 0;
    while (($pos = index($text_line, $key, $pos)) >= 0) {
        my $after = $pos + $key_length;
        if (!$segmented
            || (is_boundary(\@chars, $pos - 1) && is_boundary(\@chars, $after)))
        {
            push @entries,
              [ left_context(\@chars, $pos, $span),
                right_context(\@chars, $after, $span) ];
        }
        $pos = $after;
    }
    return @entries;
}

# Entry point: parse the command line, then for each 'keyword;span' request
# read from STDIN scan the whole text file, print one line per occurrence and
# finish with a summary line for that keyword.
sub main {
    my $segmented = 0;
    my $filename  = shift @ARGV;
    if (defined $filename && $filename eq '-s') {
        $segmented = 1;
        $filename  = shift @ARGV;
    }
    die "Usage: kwic.pl [-s] textfile [< keyfile]\n" unless defined $filename;

    # Decode input and encode output so that index/length/split work on
    # characters rather than bytes.
    my $encoding = 'UTF-8';
    if (defined $ENV{KWIC_ENCODING} && length $ENV{KWIC_ENCODING}) {
        $encoding = $ENV{KWIC_ENCODING};
    }
    my $layer = ":encoding($encoding)";
    binmode(STDIN, $layer)
      or die "Cannot use encoding $encoding on standard input: $!\n";
    binmode(STDOUT, $layer)
      or die "Cannot use encoding $encoding on standard output: $!\n";
    open(my $text, "<$layer", $filename)
      or die "Cannot open $filename: $!\n";

    # Name shown in the summary: the last '/'-separated path component.
    my $display_name = (split m{/}, $filename)[-1];
    $display_name = $filename
      unless defined $display_name && length $display_name;
    utf8::decode($display_name);    # command-line arguments arrive as bytes

    my $scans = 0;
    while (defined(my $request = <STDIN>)) {
        $request =~ s/\r?\n\z//;
        my ($key, $span) = split /;/, $request;

        # An empty keyword ends the session, as in the original script.
        last unless defined $key && length $key;
        $span = parse_span($span);

        # Rewind for every keyword after the first; a failed rewind would
        # otherwise silently report "does not exist".
        if ($scans++) {
            seek($text, 0, 0) or die "Cannot rewind $filename: $!\n";
        }

        my $found   = 0;
        my $line_no = 0;
        while (defined(my $text_line = <$text>)) {
            $text_line =~ s/\r?\n\z//;
            $text_line =~ tr/\t/ /;
            $line_no++;
            for my $entry (kwic_entries($text_line, $key, $span, $segmented)) {
                printf "[%5d][%s [%s] %s]\n",
                  $line_no, $entry->[0], $key, $entry->[1];
                $found++;
            }
        }

        if ($found > 0) {
            print ">>>Found '$key' in $display_name for $found times.<<<\n\n";
        }
        else {
            print ">>>'$key' does not exist in $display_name.<<<\n\n";
        }
    }

    close $text;
    return;
}

# Run only when executed as a script, so the subs above can be loaded with
# require for testing without triggering any I/O.
main() unless caller;