Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » [javadoc | source]
    1   package org.apache.lucene.analysis;
    2   
    3   import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    4   
    5   /**
    6    * Licensed to the Apache Software Foundation (ASF) under one or more
    7    * contributor license agreements.  See the NOTICE file distributed with
    8    * this work for additional information regarding copyright ownership.
    9    * The ASF licenses this file to You under the Apache License, Version 2.0
   10    * (the "License"); you may not use this file except in compliance with
   11    * the License.  You may obtain a copy of the License at
   12    *
   13    *     http://www.apache.org/licenses/LICENSE-2.0
   14    *
   15    * Unless required by applicable law or agreed to in writing, software
   16    * distributed under the License is distributed on an "AS IS" BASIS,
   17    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   18    * See the License for the specific language governing permissions and
   19    * limitations under the License.
   20    */
   21   
   22   /**
   23    * A filter that replaces accented characters in the ISO Latin 1 character set 
   24    * (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
   25    * <p>
   26    * For instance, '&agrave;' will be replaced by 'a'.
   27    * <p>
   28    * 
   29    * @deprecated If you build a new index, use {@link ASCIIFoldingFilter}
   30    * which covers a superset of Latin 1.
   31    * This class is included for use with existing
   32    * indexes and will be removed in a future release (possibly Lucene 4.0).
   33    */
   34   public final class ISOLatin1AccentFilter extends TokenFilter {
   35     public ISOLatin1AccentFilter(TokenStream input) {
   36       super(input);
   37       termAtt = addAttribute(TermAttribute.class);
   38     }
   39   
   40     private char[] output = new char[256];
   41     private int outputPos;
   42     private TermAttribute termAtt;
   43       
   44     @Override
   45     public final boolean incrementToken() throws java.io.IOException {    
   46       if (input.incrementToken()) {
   47         final char[] buffer = termAtt.termBuffer();
   48         final int length = termAtt.termLength();
   49         // If no characters actually require rewriting then we
   50         // just return token as-is:
   51         for(int i=0;i<length;i++) {
   52           final char c = buffer[i];
   53           if (c >= '\u00c0' && c <= '\uFB06') {
   54             removeAccents(buffer, length);
   55             termAtt.setTermBuffer(output, 0, outputPos);
   56             break;
   57           }
   58         }
   59         return true;
   60       } else
   61         return false;
   62     }
   63   
   64     /**
   65      * To replace accented characters in a String by unaccented equivalents.
   66      */
   67     public final void removeAccents(char[] input, int length) {
   68   
   69       // Worst-case length required:
   70       final int maxSizeNeeded = 2*length;
   71   
   72       int size = output.length;
   73       while (size < maxSizeNeeded)
   74         size *= 2;
   75   
   76       if (size != output.length)
   77         output = new char[size];
   78   
   79       outputPos = 0;
   80   
   81       int pos = 0;
   82   
   83       for (int i=0; i<length; i++, pos++) {
   84         final char c = input[pos];
   85   
   86         // Quick test: if it's not in range then just keep
   87         // current character
   88         if (c < '\u00c0' || c > '\uFB06')
   89           output[outputPos++] = c;
   90         else {
   91           switch (c) {
   92           case '\u00C0' : // 
   93           case '\u00C1' : // 
   94           case '\u00C2' : // 
   95           case '\u00C3' : // 
   96           case '\u00C4' : // 
   97           case '\u00C5' : // 
   98             output[outputPos++] = 'A';
   99             break;
  100           case '\u00C6' : // 
  101             output[outputPos++] = 'A';
  102             output[outputPos++] = 'E';
  103             break;
  104           case '\u00C7' : // 
  105             output[outputPos++] = 'C';
  106             break;
  107           case '\u00C8' : // 
  108           case '\u00C9' : // 
  109           case '\u00CA' : // 
  110           case '\u00CB' : // 
  111             output[outputPos++] = 'E';
  112             break;
  113           case '\u00CC' : // 
  114           case '\u00CD' : // 
  115           case '\u00CE' : // 
  116           case '\u00CF' : // 
  117             output[outputPos++] = 'I';
  118             break;
  119           case '\u0132' : // ?
  120               output[outputPos++] = 'I';
  121               output[outputPos++] = 'J';
  122               break;
  123           case '\u00D0' : // 
  124             output[outputPos++] = 'D';
  125             break;
  126           case '\u00D1' : // 
  127             output[outputPos++] = 'N';
  128             break;
  129           case '\u00D2' : // 
  130           case '\u00D3' : // 
  131           case '\u00D4' : // 
  132           case '\u00D5' : // 
  133           case '\u00D6' : // 
  134           case '\u00D8' : // 
  135             output[outputPos++] = 'O';
  136             break;
  137           case '\u0152' : // ?
  138             output[outputPos++] = 'O';
  139             output[outputPos++] = 'E';
  140             break;
  141           case '\u00DE' : // 
  142             output[outputPos++] = 'T';
  143             output[outputPos++] = 'H';
  144             break;
  145           case '\u00D9' : // 
  146           case '\u00DA' : // 
  147           case '\u00DB' : // 
  148           case '\u00DC' : // 
  149             output[outputPos++] = 'U';
  150             break;
  151           case '\u00DD' : // 
  152           case '\u0178' : // ?
  153             output[outputPos++] = 'Y';
  154             break;
  155           case '\u00E0' : // 
  156           case '\u00E1' : // 
  157           case '\u00E2' : // 
  158           case '\u00E3' : // 
  159           case '\u00E4' : // 
  160           case '\u00E5' : // 
  161             output[outputPos++] = 'a';
  162             break;
  163           case '\u00E6' : // 
  164             output[outputPos++] = 'a';
  165             output[outputPos++] = 'e';
  166             break;
  167           case '\u00E7' : // 
  168             output[outputPos++] = 'c';
  169             break;
  170           case '\u00E8' : // 
  171           case '\u00E9' : // 
  172           case '\u00EA' : // 
  173           case '\u00EB' : // 
  174             output[outputPos++] = 'e';
  175             break;
  176           case '\u00EC' : // 
  177           case '\u00ED' : // 
  178           case '\u00EE' : // 
  179           case '\u00EF' : // 
  180             output[outputPos++] = 'i';
  181             break;
  182           case '\u0133' : // ?
  183               output[outputPos++] = 'i';
  184               output[outputPos++] = 'j';
  185               break;
  186           case '\u00F0' : // 
  187             output[outputPos++] = 'd';
  188             break;
  189           case '\u00F1' : // 
  190             output[outputPos++] = 'n';
  191             break;
  192           case '\u00F2' : // 
  193           case '\u00F3' : // 
  194           case '\u00F4' : // 
  195           case '\u00F5' : // 
  196           case '\u00F6' : // 
  197           case '\u00F8' : // 
  198             output[outputPos++] = 'o';
  199             break;
  200           case '\u0153' : // ?
  201             output[outputPos++] = 'o';
  202             output[outputPos++] = 'e';
  203             break;
  204           case '\u00DF' : // 
  205             output[outputPos++] = 's';
  206             output[outputPos++] = 's';
  207             break;
  208           case '\u00FE' : // 
  209             output[outputPos++] = 't';
  210             output[outputPos++] = 'h';
  211             break;
  212           case '\u00F9' : // 
  213           case '\u00FA' : // 
  214           case '\u00FB' : // 
  215           case '\u00FC' : // 
  216             output[outputPos++] = 'u';
  217             break;
  218           case '\u00FD' : // 
  219           case '\u00FF' : // 
  220             output[outputPos++] = 'y';
  221             break;
  222           case '\uFB00': // ?
  223               output[outputPos++] = 'f';
  224               output[outputPos++] = 'f';
  225               break;
  226           case '\uFB01': // ?
  227               output[outputPos++] = 'f';
  228               output[outputPos++] = 'i';
  229               break;
  230           case '\uFB02': // ?
  231               output[outputPos++] = 'f';
  232               output[outputPos++] = 'l';
  233               break;
  234           // following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
  235   //        case '\uFB03': // ?
  236   //            output[outputPos++] = 'f';
  237   //            output[outputPos++] = 'f';
  238   //            output[outputPos++] = 'i';
  239   //            break;
  240   //        case '\uFB04': // ?
  241   //            output[outputPos++] = 'f';
  242   //            output[outputPos++] = 'f';
  243   //            output[outputPos++] = 'l';
  244   //            break;
  245           case '\uFB05': // ?
  246               output[outputPos++] = 'f';
  247               output[outputPos++] = 't';
  248               break;
  249           case '\uFB06': // ?
  250               output[outputPos++] = 's';
  251               output[outputPos++] = 't';
  252             break;
  253           default :
  254             output[outputPos++] = c;
  255             break;
  256           }
  257         }
  258       }
  259     }
  260   }

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » [javadoc | source]