/*
* To change this template, choose Tools | Templates * and open the template in the editor. */
package lingscope.algorithms;
import abner.Scanner; import java.io.StringReader;
/**
 * Static helpers that run text through ABNER's tokenizer.
 *
 * @author shashank
 */
public class AbnerTokenizer {
//////////////////////////////////////////////////////////////// /** <p>Take raw text apply ABNER's built-in tokenization on it. */ public static String tokenize(String s) { StringBuffer sb = new StringBuffer(); try { Scanner scanner = new Scanner(new StringReader(s)); String t; while ((t = scanner.nextToken()) != null) { sb.append(t+" "); if (t.toString().matches("[?!\\.]")) sb.append("\n"); } return sb.toString(); } catch (Exception e) { System.err.println(e); } return sb.toString(); } /** * Takes an input and splits the sentence by punctuations and spaces, then * stitches it back together with a space and returns * @param input the input string to process * @return processed input string, where all words and punctuations are * seperated by space */ public static String splitTermsByPunctuation(String input) { if (input.isEmpty()) { return ""; } input = input.replaceAll("\\n", " "); String ret = tokenize(input).trim(); if (ret.matches(".*\\w\\.$")) { // If a space is not put between the period in the end, then introduce one ret += " ."; } if (input.endsWith(".") && (!ret.endsWith("."))) { ret += " ."; } return ret; }
}