package lingscope.algorithms;
import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import lingscope.structures.AnnotatedSentence;
/**
* * @author shashank */
public class BaselineScopeAnnotator extends BaselineAnnotator {
protected Pattern endPattern; public BaselineScopeAnnotator(String beginTag, String interTag, String otherTag, boolean commaLimit, boolean periodLimit) { super(beginTag, interTag, otherTag); String endPatternString = ""; if (commaLimit && periodLimit) { endPatternString = ",|\\."; } else if (commaLimit) { endPatternString = ","; } else if (periodLimit) { endPatternString = "\\."; } endPattern = Pattern.compile(endPatternString); // End tagging if a comma or period is seen } @Override public AnnotatedSentence annotateSentence(String sentence, boolean isTokenized) { if (phrases == null) { throw new RuntimeException("Annotator has not been loaded"); } if (!isTokenized) { sentence = AbnerTokenizer.splitTermsByPunctuation(sentence); } String lcSentence = sentence.toLowerCase(); String[] words = sentence.split(" +"); int wordsLength = words.length; Set<Integer> addBTag = new HashSet<Integer>(); // Collect all indices where beginning and intermediate tags should // be added for (String phrase : phrases) { if (!lcSentence.contains(phrase)) { continue; } String[] phraseWords = phrase.split(" +"); for (int wordCounter = 0; wordCounter < wordsLength; ++wordCounter) { String word = words[wordCounter]; if (word.equalsIgnoreCase(phraseWords[0])) { boolean phraseMatches = true; for (int j = 0; j < phraseWords.length; ++j) { int i = j + wordCounter; if (i == wordsLength) { phraseMatches = false; break; } if (!phraseWords[j].equalsIgnoreCase(words[i])) { phraseMatches = false; break; } } if (phraseMatches) { addBTag.add(wordCounter); } } } } // Create a tagged sentence. Give preference to beginning tag over // intermediate tag in case they clash StringBuilder taggedSentence = new StringBuilder(); boolean taggerOn = false; for (int i = 0; i < wordsLength; ++i) { String word = words[i]; if (addBTag.contains(i)) { taggedSentence.append(" ").append(word).append("|").append(beginTag); taggerOn = true; } else if (taggerOn) { Matcher endMatch = endPattern.matcher(word); if (endMatch.matches()) { taggedSentence.append(" ").append(word).append("|").append(otherTag); taggerOn = false; } else { taggedSentence.append(" ").append(word).append("|").append(interTag); } } else { taggedSentence.append(" ").append(word).append("|").append(otherTag); } } String raw = taggedSentence.substring(1); return new AnnotatedSentence(raw); }
}