package lingscope.algorithms;
import generalutils.FileOperations; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger;
/**
 * The baseline annotator. Training collects every cue word or phrase found in
 * a tagged training file into {@link #phrases}; loading restores that set from
 * a previously written model file.
 *
 * @author shashank
 */
public abstract class BaselineAnnotator extends Annotator {

    private static final Logger LOGGER = Logger.getLogger(BaselineAnnotator.class.getName());

    /** Known cue phrases; {@code null} until the annotator is trained or loaded. */
    protected Set<String> phrases;

    /**
     * Creates an annotator with no phrases loaded.
     *
     * @param beginTag the tag that marks the beginning of a cue
     * @param interTag the tag that marks intermediate portions of a cue
     * @param otherTag the tag that marks everything else
     */
    public BaselineAnnotator(String beginTag, String interTag, String otherTag) {
        super(beginTag, interTag, otherTag);
        phrases = null;
    }

    /**
     * Reads the tagged sentences from the training file, collects their cue
     * phrases into {@link #phrases} and writes those phrases to the model file.
     * Failures are logged rather than propagated.
     *
     * @param trainingFile path of the file holding the tagged training sentences
     * @param modelFile path of the file the collected phrases are written to
     */
    @Override
    public void serializeAnnotator(String trainingFile, String modelFile) {
        try {
            phrases = new HashSet<String>();
            List<String> taggedSentences = FileOperations.readFile(trainingFile);
            for (String taggedSentence : taggedSentences) {
                phrases.addAll(getCueWords(taggedSentence, beginTag, interTag, otherTag));
            }
            FileOperations.writeFile(modelFile, new ArrayList<String>(phrases));
        } catch (Exception ex) {
            // Best-effort by design: the caller is not expected to handle I/O problems.
            LOGGER.log(Level.SEVERE, "Could not build baseline model " + modelFile
                    + " from training file " + trainingFile, ex);
        }
    }

    /**
     * Replaces {@link #phrases} with the entries read from the model file.
     * Failures are logged rather than propagated; the phrase set is then left
     * empty.
     *
     * @param modelFile path of the model file to read
     */
    @Override
    public void loadAnnotator(String modelFile) {
        try {
            phrases = new HashSet<String>();
            phrases.addAll(FileOperations.readFile(modelFile));
        } catch (Exception ex) {
            // Best-effort by design: the caller is not expected to handle I/O problems.
            LOGGER.log(Level.SEVERE, "Could not load baseline model " + modelFile, ex);
        }
    }

    /**
     * Gets the set of cue words or phrases in the given sentence. The sentence
     * is a sequence of {@code word|tag} elements separated by spaces. A phrase
     * starts at a begin-tagged word, continues over intermediate-tagged words
     * and ends at the next other-tagged word, the next begin-tagged word, or
     * the end of the sentence. Phrases are returned trimmed and lower-cased.
     *
     * <p>Elements without a {@code |} separator are skipped, as are
     * intermediate-tagged words that do not follow a begin tag. Words carrying
     * a tag that matches none of the three given tags are ignored and do not
     * end the current phrase. Tags are compared case-insensitively.
     *
     * @param abnerTaggedSentence sentence tagged by Abner's specification;
     *        {@code null} yields an empty set
     * @param beginTag the tag to mark the beginning of the cue
     * @param intermediateTag the tag to mark intermediate portions
     * @param otherTag the other tag
     * @return the set of cue words or phrases in the given sentence, never
     *         {@code null}
     */
    public static Set<String> getCueWords(String abnerTaggedSentence, String beginTag,
            String intermediateTag, String otherTag) {
        Set<String> cueWordsPhrases = new HashSet<String>();
        if (abnerTaggedSentence == null) {
            return cueWordsPhrases;
        }

        boolean collect = false;
        StringBuilder collectedPhrase = new StringBuilder();
        // trim() keeps leading whitespace from producing an empty first element.
        for (String element : abnerTaggedSentence.trim().split(" +")) {
            String[] elementTokens = element.split("\\|");
            if (elementTokens.length < 2) {
                // Blank line or untagged token: nothing to classify.
                continue;
            }
            String word = elementTokens[0];
            String tag = elementTokens[1];

            if (tag.equalsIgnoreCase(beginTag)) {
                if (collect) {
                    // Two cues directly next to each other: close the first one.
                    flushPhrase(collectedPhrase, cueWordsPhrases);
                }
                collect = true;
                collectedPhrase.append(word).append(' ');
            } else if (tag.equalsIgnoreCase(intermediateTag)) {
                if (collect) {
                    collectedPhrase.append(word).append(' ');
                }
                // An intermediate tag without a preceding begin tag is dropped so
                // that it cannot leak into a later phrase.
            } else if (tag.equalsIgnoreCase(otherTag) && collect) {
                collect = false;
                flushPhrase(collectedPhrase, cueWordsPhrases);
            }
        }

        if (collect) {
            // The sentence ended while a cue was still open.
            flushPhrase(collectedPhrase, cueWordsPhrases);
        }
        return cueWordsPhrases;
    }

    /** Adds the buffered phrase (trimmed, lower-cased) to the set if non-empty and clears the buffer. */
    private static void flushPhrase(StringBuilder collectedPhrase, Set<String> cueWordsPhrases) {
        String phrase = collectedPhrase.toString().trim().toLowerCase(Locale.ROOT);
        if (phrase.length() > 0) {
            cueWordsPhrases.add(phrase);
        }
        collectedPhrase.setLength(0);
    }
}