Download code

Jump to: navigation, search

Back to POS_tagger_(Java)

Download for Windows: zip

Download for UNIX: zip, tar.gz, tar.bz2

AbstractHMM.java

 1 /* The authors of this work have released all rights to it and placed it
 2 in the public domain under the Creative Commons CC0 1.0 waiver
 3 (http://creativecommons.org/publicdomain/zero/1.0/).
 4 
 5 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 6 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 7 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 8 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 9 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
10 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
11 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
12 
13 Retrieved from: http://en.literateprograms.org/POS_tagger_(Java)?oldid=16733
14 */
15 
16 import java.util.Map;
17 
18 abstract class AbstractHMM {
19     protected String a;
20     protected int n;
21     protected String obsA;
22     protected String obs;
23     protected double[][] A;
24     protected double[][] B;
25     protected String mostProbableSequence;
26     public AbstractHMM(String a, String obsA, String obs,
27             Map<String, String> lexicon, String corpus) {
28         MatrixGenerator gen = new MatrixGenerator(lexicon, corpus, a);
29         this.a = a;
30         this.n = obs.length();
31         this.obsA = obsA;
32         this.obs = obs;
33         this.A = gen.createMatrixA(0.01);
34         this.B = gen.createMatrixB();
35     }
36 
37     public abstract String mostProbableSequence();
38 }


hijacker
hijacker
hijacker
hijacker

Tagger.java

 1 /* The authors of this work have released all rights to it and placed it
 2 in the public domain under the Creative Commons CC0 1.0 waiver
 3 (http://creativecommons.org/publicdomain/zero/1.0/).
 4 
 5 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 6 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 7 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 8 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 9 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
10 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
11 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
12 
13 Retrieved from: http://en.literateprograms.org/POS_tagger_(Java)?oldid=16733
14 */
15 
16 
17 import java.util.HashMap;
18 import java.util.Map;
19 import org.junit.Test;
20 
21 public class Tagger {
22 	String tagset;
23 	String corpus;
24 	Map<String, String> lexicon;
25 	String lexiconString;
26 	@Test
27 	public void testTagger() {
28 		
29 		System.out.println(tag("layers that fly that food . that fly sneaks ."));
30 	}
31 	
32 	private void initData() {
33 	    lexicon = new HashMap<String, String>();
34 	    lexicon.put("fly", "vn");
35 	    lexicon.put("layers", "vn");
36 	    lexicon.put("sneaks", "v");
37 	    lexicon.put("food", "n");
38 	    lexicon.put("that", "det,rp");
39 	    lexicon.put(".", "period");
40 	    StringBuffer lexBuf = new StringBuffer();
41 	    for (String word : lexicon.keySet()) {
42 	        lexBuf.append(word + " ");
43 	    }
44 	    lexiconString = lexBuf.toString();
45 	    tagset = "det rp vn v n period";
46 	    corpus = "that fly layers. that fly sneaks. that, that sneaks food" +
47 	    		"layers. that fly layers that food. layers.";
48 	}
49 
50 	
51 	public String tag(String input) {
52 	    initData();
53 	    ViterbiHMM hmm = new ViterbiHMM(tagset, lexiconString, input, lexicon,
54 	            corpus, true);
55 	    String[] in = input.split(" ");
56 	    String[] re = hmm.mostProbableSequence().split(" ");
57 	    StringBuilder buf = new StringBuilder();
58 	    for (int i = 0; i < in.length; i++) {
59 	        buf.append(in[i] + ":" + re[i] + " ");
60 	    }
61 	    return buf.toString();
62 	}
63 }


hijacker
hijacker
hijacker
hijacker

ViterbiMatrixTools.java

 1 /* The authors of this work have released all rights to it and placed it
 2 in the public domain under the Creative Commons CC0 1.0 waiver
 3 (http://creativecommons.org/publicdomain/zero/1.0/).
 4 
 5 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 6 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 7 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 8 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 9 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
10 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
11 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
12 
13 Retrieved from: http://en.literateprograms.org/POS_tagger_(Java)?oldid=16733
14 */
15 
16 import java.text.NumberFormat;
17 
18 public class ViterbiMatrixTools {
19 	
20 	static double sumForCol(int i, double[][] matrix) {
21 	    double sum = 0;
22 	    for (int j = 0; j < matrix.length; j++) {
23 	        sum += matrix[j][i];
24 	    }
25 	    return sum;
26 	}
27 	
28 	static double sumForRow(int i, double[][] matrix) {
29 	    double sum = 0;
30 	    for (int j = 0; j < matrix[i].length; j++) {
31 	        sum += matrix[i][j];
32 	    }
33 	    return sum;
34 	}
35 	
36 	static double maximimumForCol(int i, double[][] matrix) {
37 	    double maxValue = 0.0;
38 	    for (int j = 0; j < matrix.length; j++) {
39 	        maxValue = Math.max(maxValue, matrix[j][i]);
40 	    }
41 	    return maxValue;
42 	}
43 	
44 	static int indexOfMaximimumForCol(int i, double[][] matrix) {
45 	    int maxIndex = -1;
46 	    double maxValue = -1.0;
47 	    for (int j = 0; j < matrix.length; j++) {
48 	        if (matrix[j][i] > maxValue) {
49 	            maxIndex = j;
50 	            maxValue = matrix[j][i];
51 	        }
52 	    }
53 	    return maxIndex;
54 	}
55 	
56 	static int indexOfMaximimumForCol(int i, int[][] matrix) {
57 	    int maxIndex = -1;
58 	    int maxValue = -1;
59 	    for (int j = 0; j < matrix.length; j++) {
60 	        if (matrix[j][i] > maxValue) {
61 	            maxIndex = j;
62 	            maxValue = matrix[j][i];
63 	        }
64 	    }
65 	    return maxIndex;
66 	}
67 	
68 	static void printMatrix(double[][] matrix) {
69 	    for (int i = 0; i < matrix.length; i++) {
70 	        for (int j = 0; j < matrix[i].length; j++) {
71 	            String myString = NumberFormat.getInstance().format(
72 	                    matrix[i][j]);
73 	            if (myString.length() < 5) {
74 	                for (int k = myString.length(); k < 5; k++) {
75 	                    myString += " ";
76 	                }
77 	            }
78 	            System.out.print(myString + "   ");
79 	        }
80 	        System.out.println();
81 	    }
82 	}
83 	
84 	static void printMatrix(int[][] matrix) {
85 	    for (int i = 0; i < matrix.length; i++) {
86 	        for (int j = 0; j < matrix[i].length; j++) {
87 	            if (matrix[i][j] >= 0)
88 	                System.out.print(" ");
89 	            System.out.print(matrix[i][j] + "   ");
90 	        }
91 	        System.out.println();
92 	    }
93 	}
94 }
95 


hijacker
hijacker
hijacker
hijacker

MatrixGenerator.java

  1 /* The authors of this work have released all rights to it and placed it
  2 in the public domain under the Creative Commons CC0 1.0 waiver
  3 (http://creativecommons.org/publicdomain/zero/1.0/).
  4 
  5 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  6 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  7 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  8 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  9 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 10 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 11 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 12 
 13 Retrieved from: http://en.literateprograms.org/POS_tagger_(Java)?oldid=16733
 14 */
 15 
 16 
 17 import java.util.Arrays;
 18 import java.util.HashMap;
 19 import java.util.Iterator;
 20 import java.util.Map;
 21 import java.util.Random;
 22 import java.util.StringTokenizer;
 23 
 24 public class MatrixGenerator {
 25 	private Map<String, String> lexicon;
 26 	private Map<String, Integer> freq;
 27 	private String corpus;
 28 	private double[][] result;
 29 	private String tagset;
 30 	public MatrixGenerator(Map<String, String> lexicon, String corpus, String tagset) {
 31 		this.corpus = corpus;
 32 		this.lexicon = lexicon;
 33 		this.tagset = tagset;
 34 		countLexems();
 35 		System.out.println("Tagset: " + tagset);
 36 		System.out.println();
 37 		System.out.println("Corpus: " + corpus);
 38 		System.out.println();
 39 		System.out.println("Frequencies:");
 40 		System.out.println();
 41 		printMap(freq);
 42 	}
 43 	public double[][] createMatrixB() {
 44 		String[] t = tagset.split(" ");
 45 		String[] l = lexicon.keySet.toArray(new String[0]);
 46 		
 47 		result = new double[l.length][t.length];
 48 		for (int j = 0; j < result.length; j++) {
 49 		    Arrays.fill(result[j], 0.0);
 50 		    for (int k = 0; k < result[j].length; k++) {
 51 		        String categories = lexicon.get(l[j]);
 52 		        if (categories.contains(t[k])) {
 53 		            result[j][k] = 1.0 / categories.split(",").length;
 54 		        }
 55 		    }
 56 		}
 57 		
 58 		for (int j = 0; j < result.length; j++) {
 59 		    for (int k = 0; k < result[j].length; k++) {
 60 		        double a = result[j][k] * freq.get(l[j]);
 61 		        double b = 0.0;
 62 		        for (int index = 0; index < result.length; index++) {
 63 		            b += result[index][k] * freq.get(l[index]);
 64 		        }
 65 		        result[j][k] = a / b;
 66 		    }
 67 		}
 68 		System.out.println();
 69 		System.out
 70 		        .println("Matrix B (word-tag, lexicon and tags ordered as printed above, learned from corpus):");
 71 		ViterbiMatrixTools.printMatrix(result);
 72 		return result;
 73 	}
 74 	
 75 	public double[][] createMatrixA(double diff) {
 76 	    String[] t = tagset.split(" ");
 77 	    double[][] res = new double[t.length][t.length];
 78 	    int cols = res[0].length;
 79 	    double average = 1.0 / cols;
 80 	    for (int i = 0; i < res.length; i++) {
 81 	        Arrays.fill(res[i], average);
 82 	    }
 83 	    Random r = new Random();
 84 	    for (int i = 0; i < res.length; i++) {
 85 	        double var = 1.0;
 86 	        while (var > diff)
 87 	            var = r.nextDouble();
 88 	        int pos = r.nextInt(res[i].length);
 89 	        boolean plus = true;
 90 	        for (int j = 0; j < res[i].length; j++) {
 91 	            // if not even, skip one random position
 92 	            if (res[i].length % 2 != 0 && j == pos)
 93 	                continue;
 94 	            if (plus && res[i][j] - var > 0) {
 95 	                res[i][j] += var;
 96 	                plus = false;
 97 	            } else if (res[i][j] - var > 0) {
 98 	                res[i][j] -= var;
 99 	                plus = true;
100 	            }
101 	        }
102 	    }
103 	    System.out.println();
104 	    System.out.println("Matrix A (tag-tag, pseudo-random-generated):");
105 	    ViterbiMatrixTools.printMatrix(res);
106 	    System.out.println();
107 	    return res;
108 	}
109 	
110 	private void printMap(Map<String, ?> map) {
111 	    for (Map<String, ?> e : map.entrySet()) {
112 	        System.out.println(e.getKey() + ": " + e.getValue());
113 	    }
114 	}
115 	
116 	private void countLexems() {
117 	    freq = new HashMap<String, Integer>();
118 	    for (StringTokenizer tok = new StringTokenizer(corpus, " \t\n.,", true);
119 	         tok.hasMoreTokens(); ) {
120 	        String rec = tok.nextToken();
121 	        if (lexicon.containsKey(rec)) {
122 	            if (freq.containsKey(rec)) {
123 	                freq.put(rec, freq.get(rec) + 1);
124 	            } else {
125 	                freq.put(rec, 1);
126 	            }
127 	        }
128 	    }
129 	}
130 }


hijacker
hijacker
hijacker
hijacker

ViterbiHMM.java

  1 /* The authors of this work have released all rights to it and placed it
  2 in the public domain under the Creative Commons CC0 1.0 waiver
  3 (http://creativecommons.org/publicdomain/zero/1.0/).
  4 
  5 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  6 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  7 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  8 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  9 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 10 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 11 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 12 
 13 Retrieved from: http://en.literateprograms.org/POS_tagger_(Java)?oldid=16733
 14 */
 15 
 16 import java.util.Arrays;
 17 import java.util.Map;
 18 
 19 public class ViterbiHMM extends AbstractHMM {
 20     private double[][] delta;
 21     private int[][] psi;
 22     private String[] hiddenAlphabet;
 23     private String[] observableAlphabet;
 24     private String[] observation;
 25     private boolean absoluteValues;
 26     public ViterbiHMM(String a, String obsA, String obs,
 27             Map<String, String> lexicon, String corpus, boolean absoluteValues) {
 28         super(a, obsA, ". " + obs, lexicon, corpus);
 29         this.absoluteValues = absoluteValues;
 30     }
 31     public String mostProbableSequence() {
 32         
 33 	init();
 34 	induct();
 35         System.out.println("Delta:");
 36         ViterbiMatrixTools.printMatrix(delta);
 37         System.out.println();
 38         System.out.println("Psi:");
 39         ViterbiMatrixTools.printMatrix(psi);
 40         System.out.println();
 41         return getResult();
 42     }
 43     
 44     private void init() {
 45         hiddenAlphabet = a.split("\\s");
 46         observableAlphabet = obsA.split("\\s");
 47         observation = obs.split("\\s");
 48         delta = new double[hiddenAlphabet.length][observation.length];
 49         delta[delta.length - 1][0] = 1.0;
 50         psi = new int[hiddenAlphabet.length][observation.length - 1];
 51         for (int i = 0; i < psi.length; i++) {
 52             Arrays.fill(psi[i], -1);
 53         }
 54     }
 55     
 56     private void induct() {
 57         // (1)
 58         for (int i = 1; i < observation.length; i++) {
 59             // (2)
 60             for (int j = 0; j < hiddenAlphabet.length; j++) {
 61                 double prevTagMax = ViterbiMatrixTools.maximimumForCol(
 62                         i - 1, delta);
 63                 // (3)
 64                 int lexIndex = getIndex(observation[i], observableAlphabet);
 65                 // (4)
 66                 double prob = probForWordToTag(lexIndex + 1, j, B, A);
 67                 double res = prevTagMax * prob;
 68                 delta[j][i] = res;
 69                 if (res > 0.0)
 70                     psi[j][i - 1] = ViterbiMatrixTools.indexOfMaximimumForCol(
 71                             i - 1, delta);
 72             }
 73 
 74         }
 75     }
 76     
 77     private double probForWordToTag(int i, int j, double[][] b, double[][] a) {
 78         // delta has the leading period, therefore - 1 for previous:
 79         int prevIndex = ViterbiMatrixTools.indexOfMaximimumForCol(i - 1, delta);
 80         // b doesn't have the leading p, therefore -1 for recent:
 81         if (absoluteValues)
 82             return (b[i - 1][j] / ViterbiMatrixTools.sumForCol(j, B))
 83                     * (A[prevIndex][j] / ViterbiMatrixTools.sumForRow(
 84                             prevIndex, A));
 85         else {
 86             return b[i - 1][j] * A[prevIndex][j];
 87         }
 88     }
 89     
 90     private String getResult() {
 91         String[] resultArray = new String[psi[0].length];
 92         int lastIndexInPsi = ViterbiMatrixTools.indexOfMaximimumForCol(
 93                 delta[0].length - 1, delta);
 94         if (lastIndexInPsi == -1) {
 95             System.out.println("no tag-sequence found for input, exit.");
 96             System.exit(0);
 97         }
 98         int lastValueInPsi = psi[lastIndexInPsi][psi[0].length - 1];
 99         String lastTag = hiddenAlphabet[lastIndexInPsi];
100         resultArray[resultArray.length - 1] = lastTag;
101         // retrieve other tags:
102         for (int i = psi[0].length - 2; i >= 0; i--) {
103             resultArray[i] = hiddenAlphabet[lastValueInPsi];
104             lastValueInPsi = psi[lastValueInPsi][i];
105         }
106         StringBuffer resultString = new StringBuffer();
107         for (int i = 0; i < resultArray.length; i++) {
108             resultString.append(resultArray[i]);
109             if (i < resultArray.length - 1)
110                 resultString.append(" ");
111         }
112         return resultString.toString();
113     }
114     
115     int getIndex(String string, String[] lexicon) {
116         for (int i = 0; i < lexicon.length; i++) {
117             if (string.equals(lexicon[i]))
118                 return i;
119         }
120         System.out.println("Word '" + string + "' not found in lexicon, exit.");
121         System.exit(0);
122         return -1;
123     }
124 }


hijacker
hijacker
hijacker
hijacker