Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » nl » [javadoc | source]
    1   package org.apache.lucene.analysis.nl;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
import java.util.Locale;
import java.util.Map;
   21   
   22   /**
   23    * A stemmer for Dutch words. 
   24    * <p>
   25    * The algorithm is an implementation of
   26    * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
   27    * algorithm in Martin Porter's snowball project.
   28    * </p>
   29    */
   30   
   31   public class DutchStemmer {
   32     /**
   33      * Buffer for the terms while stemming them.
   34      */
   35     private StringBuilder sb = new StringBuilder();
   36     private boolean _removedE;
   37     private Map _stemDict;
   38   
   39     private int _R1;
   40     private int _R2;
   41   
   42     //TODO convert to internal
   43     /*
   44      * Stems the given term to an unique <tt>discriminator</tt>.
   45      *
   46      * @param term The term that should be stemmed.
   47      * @return Discriminator for <tt>term</tt>
   48      */
   49     public String stem(String term) {
   50       term = term.toLowerCase();
   51       if (!isStemmable(term))
   52         return term;
   53       if (_stemDict != null && _stemDict.containsKey(term))
   54         if (_stemDict.get(term) instanceof String)
   55           return (String) _stemDict.get(term);
   56         else
   57           return null;
   58   
   59       // Reset the StringBuilder.
   60       sb.delete(0, sb.length());
   61       sb.insert(0, term);
   62       // Stemming starts here...
   63       substitute(sb);
   64       storeYandI(sb);
   65       _R1 = getRIndex(sb, 0);
   66       _R1 = Math.max(3, _R1);
   67       step1(sb);
   68       step2(sb);
   69       _R2 = getRIndex(sb, _R1);
   70       step3a(sb);
   71       step3b(sb);
   72       step4(sb);
   73       reStoreYandI(sb);
   74       return sb.toString();
   75     }
   76   
   77     private boolean enEnding(StringBuilder sb) {
   78       String[] enend = new String[]{"ene", "en"};
   79       for (int i = 0; i < enend.length; i++) {
   80         String end = enend[i];
   81         String s = sb.toString();
   82         int index = s.length() - end.length();
   83         if (s.endsWith(end) &&
   84             index >= _R1 &&
   85             isValidEnEnding(sb, index - 1)
   86         ) {
   87           sb.delete(index, index + end.length());
   88           unDouble(sb, index);
   89           return true;
   90         }
   91       }
   92       return false;
   93     }
   94   
   95   
   96     private void step1(StringBuilder sb) {
   97       if (_R1 >= sb.length())
   98         return;
   99   
  100       String s = sb.toString();
  101       int lengthR1 = sb.length() - _R1;
  102       int index;
  103   
  104       if (s.endsWith("heden")) {
  105         sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
  106         return;
  107       }
  108   
  109       if (enEnding(sb))
  110         return;
  111   
  112       if (s.endsWith("se") &&
  113           (index = s.length() - 2) >= _R1 &&
  114           isValidSEnding(sb, index - 1)
  115       ) {
  116         sb.delete(index, index + 2);
  117         return;
  118       }
  119       if (s.endsWith("s") &&
  120           (index = s.length() - 1) >= _R1 &&
  121           isValidSEnding(sb, index - 1)) {
  122         sb.delete(index, index + 1);
  123       }
  124     }
  125   
  126     /**
  127      * Delete suffix e if in R1 and
  128      * preceded by a non-vowel, and then undouble the ending
  129      *
  130      * @param sb String being stemmed
  131      */
  132     private void step2(StringBuilder sb) {
  133       _removedE = false;
  134       if (_R1 >= sb.length())
  135         return;
  136       String s = sb.toString();
  137       int index = s.length() - 1;
  138       if (index >= _R1 &&
  139           s.endsWith("e") &&
  140           !isVowel(sb.charAt(index - 1))) {
  141         sb.delete(index, index + 1);
  142         unDouble(sb);
  143         _removedE = true;
  144       }
  145     }
  146   
  147     /**
  148      * Delete "heid"
  149      *
  150      * @param sb String being stemmed
  151      */
  152     private void step3a(StringBuilder sb) {
  153       if (_R2 >= sb.length())
  154         return;
  155       String s = sb.toString();
  156       int index = s.length() - 4;
  157       if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
  158         sb.delete(index, index + 4); //remove heid
  159         enEnding(sb);
  160       }
  161     }
  162   
  163     /**
  164      * <p>A d-suffix, or derivational suffix, enables a new word,
  165      * often with a different grammatical category, or with a different
  166      * sense, to be built from another word. Whether a d-suffix can be
  167      * attached is discovered not from the rules of grammar, but by
  168      * referring to a dictionary. So in English, ness can be added to
  169      * certain adjectives to form corresponding nouns (littleness,
  170      * kindness, foolishness ...) but not to all adjectives
  171      * (not for example, to big, cruel, wise ...) d-suffixes can be
  172      * used to change meaning, often in rather exotic ways.</p>
  173      * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
  174      *
  175      * @param sb String being stemmed
  176      */
  177     private void step3b(StringBuilder sb) {
  178       if (_R2 >= sb.length())
  179         return;
  180       String s = sb.toString();
  181       int index = 0;
  182   
  183       if ((s.endsWith("end") || s.endsWith("ing")) &&
  184           (index = s.length() - 3) >= _R2) {
  185         sb.delete(index, index + 3);
  186         if (sb.charAt(index - 2) == 'i' &&
  187             sb.charAt(index - 1) == 'g') {
  188           if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) {
  189             index -= 2;
  190             sb.delete(index, index + 2);
  191           }
  192         } else {
  193           unDouble(sb, index);
  194         }
  195         return;
  196       }
  197       if (s.endsWith("ig") &&
  198           (index = s.length() - 2) >= _R2
  199       ) {
  200         if (sb.charAt(index - 1) != 'e')
  201           sb.delete(index, index + 2);
  202         return;
  203       }
  204       if (s.endsWith("lijk") &&
  205           (index = s.length() - 4) >= _R2
  206       ) {
  207         sb.delete(index, index + 4);
  208         step2(sb);
  209         return;
  210       }
  211       if (s.endsWith("baar") &&
  212           (index = s.length() - 4) >= _R2
  213       ) {
  214         sb.delete(index, index + 4);
  215         return;
  216       }
  217       if (s.endsWith("bar") &&
  218           (index = s.length() - 3) >= _R2
  219       ) {
  220         if (_removedE)
  221           sb.delete(index, index + 3);
  222         return;
  223       }
  224     }
  225   
  226     /**
  227      * undouble vowel
  228      * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
  229      *
  230      * @param sb String being stemmed
  231      */
  232     private void step4(StringBuilder sb) {
  233       if (sb.length() < 4)
  234         return;
  235       String end = sb.substring(sb.length() - 4, sb.length());
  236       char c = end.charAt(0);
  237       char v1 = end.charAt(1);
  238       char v2 = end.charAt(2);
  239       char d = end.charAt(3);
  240       if (v1 == v2 &&
  241           d != 'I' &&
  242           v1 != 'i' &&
  243           isVowel(v1) &&
  244           !isVowel(d) &&
  245           !isVowel(c)) {
  246         sb.delete(sb.length() - 2, sb.length() - 1);
  247       }
  248     }
  249   
  250     /**
  251      * Checks if a term could be stemmed.
  252      *
  253      * @return true if, and only if, the given term consists in letters.
  254      */
  255     private boolean isStemmable(String term) {
  256       for (int c = 0; c < term.length(); c++) {
  257         if (!Character.isLetter(term.charAt(c))) return false;
  258       }
  259       return true;
  260     }
  261   
  262     /**
  263      * Substitute , , , , ,  , , , , 
  264      */
  265     private void substitute(StringBuilder buffer) {
  266       for (int i = 0; i < buffer.length(); i++) {
  267         switch (buffer.charAt(i)) {
  268           case '':
  269           case '':
  270             {
  271               buffer.setCharAt(i, 'a');
  272               break;
  273             }
  274           case '':
  275           case '':
  276             {
  277               buffer.setCharAt(i, 'e');
  278               break;
  279             }
  280           case '':
  281           case '':
  282             {
  283               buffer.setCharAt(i, 'u');
  284               break;
  285             }
  286           case '':
  287           case 'i':
  288             {
  289               buffer.setCharAt(i, 'i');
  290               break;
  291             }
  292           case '':
  293           case '':
  294             {
  295               buffer.setCharAt(i, 'o');
  296               break;
  297             }
  298         }
  299       }
  300     }
  301   
  302     /*private boolean isValidSEnding(StringBuilder sb) {
  303       return isValidSEnding(sb, sb.length() - 1);
  304     }*/
  305   
  306     private boolean isValidSEnding(StringBuilder sb, int index) {
  307       char c = sb.charAt(index);
  308       if (isVowel(c) || c == 'j')
  309         return false;
  310       return true;
  311     }
  312   
  313     /*private boolean isValidEnEnding(StringBuilder sb) {
  314       return isValidEnEnding(sb, sb.length() - 1);
  315     }*/
  316   
  317     private boolean isValidEnEnding(StringBuilder sb, int index) {
  318       char c = sb.charAt(index);
  319       if (isVowel(c))
  320         return false;
  321       if (c < 3)
  322         return false;
  323       // ends with "gem"?
  324       if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
  325         return false;
  326       return true;
  327     }
  328   
  329     private void unDouble(StringBuilder sb) {
  330       unDouble(sb, sb.length());
  331     }
  332   
  333     private void unDouble(StringBuilder sb, int endIndex) {
  334       String s = sb.substring(0, endIndex);
  335       if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
  336         sb.delete(endIndex - 1, endIndex);
  337       }
  338     }
  339   
  340     private int getRIndex(StringBuilder sb, int start) {
  341       if (start == 0)
  342         start = 1;
  343       int i = start;
  344       for (; i < sb.length(); i++) {
  345         //first non-vowel preceded by a vowel
  346         if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
  347           return i + 1;
  348         }
  349       }
  350       return i + 1;
  351     }
  352   
  353     private void storeYandI(StringBuilder sb) {
  354       if (sb.charAt(0) == 'y')
  355         sb.setCharAt(0, 'Y');
  356   
  357       int last = sb.length() - 1;
  358   
  359       for (int i = 1; i < last; i++) {
  360         switch (sb.charAt(i)) {
  361           case 'i':
  362             {
  363               if (isVowel(sb.charAt(i - 1)) &&
  364                   isVowel(sb.charAt(i + 1))
  365               )
  366                 sb.setCharAt(i, 'I');
  367               break;
  368             }
  369           case 'y':
  370             {
  371               if (isVowel(sb.charAt(i - 1)))
  372                 sb.setCharAt(i, 'Y');
  373               break;
  374             }
  375         }
  376       }
  377       if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
  378         sb.setCharAt(last, 'Y');
  379     }
  380   
  381     private void reStoreYandI(StringBuilder sb) {
  382       String tmp = sb.toString();
  383       sb.delete(0, sb.length());
  384       sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
  385     }
  386   
  387     private boolean isVowel(char c) {
  388       switch (c) {
  389         case 'e':
  390         case 'a':
  391         case 'o':
  392         case 'i':
  393         case 'u':
  394         case 'y':
  395         case '':
  396           {
  397             return true;
  398           }
  399       }
  400       return false;
  401     }
  402   
  403     void setStemDictionary(Map dict) {
  404       _stemDict = dict;
  405     }
  406   
  407   }

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » nl » [javadoc | source]