package lingscope.drivers;
import generalutils.FileOperations; import java.util.ArrayList; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import lingscope.io.AnnotatedSentencesIO; import lingscope.structures.AnnotatedSentence;
/**
 * Merges two files: one containing POS tags and the other containing
 * annotated cues.
 *
 * @author shashank
 */
public class CueAndPosFilesMerger {

    /**
     * Merges a single cue-annotated sentence with its POS-tagged counterpart.
     * <p>
     * For each whitespace-separated token of {@code posSentence} one merged
     * token of the form {@code left|tag} is produced, where {@code tag} is the
     * corresponding entry of {@code cueSentence.getTags()} and {@code left} is:
     * <ul>
     *   <li>the POS tag, when the token is not flagged in
     *       {@code cueSentence.getIsAnnotatedTags()};</li>
     *   <li>the literal {@code CUE}, when the token is flagged and
     *       {@code replaceTags} is {@code true};</li>
     *   <li>the original word, when the token is flagged and
     *       {@code replaceTags} is {@code false}.</li>
     * </ul>
     * Merged tokens are joined with single spaces and handed to the
     * {@code AnnotatedSentence(String)} constructor.
     * <p>
     * If {@code posSentence} has fewer tokens than the cue sentence, only that
     * many leading tokens are merged.
     *
     * @param cueSentence sentence carrying the words, tags and annotation flags
     * @param posSentence whitespace-separated POS tags for the same sentence;
     *                    surrounding whitespace is ignored
     * @param replaceTags {@code true} to emit {@code CUE} for annotated tokens,
     *                    {@code false} to emit the original word instead
     * @return the merged sentence
     * @throws IndexOutOfBoundsException if {@code posSentence} has more tokens
     *                                   than the cue sentence provides
     */
    public static AnnotatedSentence merge(AnnotatedSentence cueSentence, String posSentence, boolean replaceTags) {
        // Trim before splitting: leading whitespace would otherwise yield an
        // empty first token and shift every POS tag one position to the right.
        // After trimming, split always returns at least one element (a blank
        // input yields a single empty tag).
        String[] posTags = posSentence.trim().split("\\s+");
        List<String> crfTags = cueSentence.getTags();
        List<String> words = cueSentence.getWords();
        List<Boolean> tagStatusList = cueSentence.getIsAnnotatedTags();

        int numWords = posTags.length;
        int available = Math.min(words.size(), Math.min(crfTags.size(), tagStatusList.size()));
        if (numWords > available) {
            // Same exception type the list accesses below would raise, but with
            // enough context to locate the offending sentence.
            throw new IndexOutOfBoundsException("POS sentence has " + numWords
                    + " token(s) but the cue sentence only provides " + available
                    + ": \"" + posSentence + "\"");
        }

        StringBuilder mergedSentence = new StringBuilder();
        for (int j = 0; j < numWords; ++j) {
            if (j > 0) {
                mergedSentence.append(" ");
            }
            boolean tagStatus = tagStatusList.get(j);
            if (!tagStatus) {
                mergedSentence.append(posTags[j]);
            } else if (replaceTags) {
                mergedSentence.append("CUE");
            } else {
                mergedSentence.append(words.get(j));
            }
            mergedSentence.append("|").append(crfTags.get(j));
        }
        return new AnnotatedSentence(mergedSentence.toString());
    }

    /**
     * Merges the cue sentences with the POS sentences position by position,
     * delegating each pair to
     * {@link #merge(AnnotatedSentence, String, boolean)}.
     * <p>
     * One merged sentence is produced per entry of {@code posSentences}; if
     * {@code cueSentences} is longer, its surplus entries are ignored.
     *
     * @param cueSentences cue-annotated sentences
     * @param posSentences POS-tagged sentences, aligned by index with
     *                     {@code cueSentences}
     * @param replaceTags  {@code true} to replace annotated tokens with the
     *                     literal {@code CUE}, {@code false} to keep the word
     * @return the merged sentences, in input order
     * @throws IndexOutOfBoundsException if {@code posSentences} has more
     *                                   entries than {@code cueSentences}
     */
    public static List<AnnotatedSentence> merge(List<AnnotatedSentence> cueSentences, List<String> posSentences, boolean replaceTags) {
        int numSentences = posSentences.size();
        if (numSentences > cueSentences.size()) {
            throw new IndexOutOfBoundsException("Got " + numSentences
                    + " POS sentence(s) but only " + cueSentences.size()
                    + " cue sentence(s)");
        }
        List<AnnotatedSentence> mergedSentences = new ArrayList<AnnotatedSentence>(numSentences);
        for (int i = 0; i < numSentences; ++i) {
            mergedSentences.add(merge(cueSentences.get(i), posSentences.get(i), replaceTags));
        }
        return mergedSentences;
    }

    /**
     * Command-line entry point: reads the cue file and the POS file, merges
     * them, and writes the result. Exits with status 1 when too few arguments
     * are given or the POS file cannot be read.
     *
     * @param args
     * 0 - cue input file
     * 1 - pos input file
     * 2 - replace cue with custom tag 'CUE' (true) or leave it as it is (false)
     * 3 - merged file output path
     */
    public static void main(String[] args) {
        if (args.length < 4) {
            // Fail with a usage hint instead of a bare ArrayIndexOutOfBoundsException.
            System.err.println("Usage: CueAndPosFilesMerger <cue input file> <pos input file> "
                    + "<replace cue with 'CUE': true|false> <merged output file>");
            System.exit(1);
        }
        boolean replaceTags = Boolean.parseBoolean(args[2]);
        List<AnnotatedSentence> cueSentences = AnnotatedSentencesIO.read(args[0]);
        List<String> posSentences = null;
        try {
            posSentences = FileOperations.readFile(args[1]);
        } catch (Exception ex) {
            Logger.getLogger(CueAndPosFilesMerger.class.getName()).log(Level.SEVERE, null, ex);
            System.exit(1);
        }
        AnnotatedSentencesIO.write(args[3], merge(cueSentences, posSentences, replaceTags));
    }
}