在Tomcat中自动检测URI编码

Posted

技术标签:

【中文标题】在Tomcat中自动检测URI编码【英文标题】:Detect the URI encoding automatically in Tomcat 【发布时间】:2011-02-09 02:15:34 【问题描述】:

我有一个正在运行的 Apache Tomcat 6.x 实例,我希望它能够比默认行为更智能地解释传入 URL 的字符集。我特别想实现如下映射:

So%DFe => Soße
So%C3%9Fe => Soße
So%DF%C3%9F => (error)

我想要的行为可以描述为“尝试将字节流解码为 UTF-8,如果它不起作用则假设 ISO-8859-1”。

在这种情况下,仅使用 URIEncoding 配置不起作用。那么如何配置 Tomcat 以按照我想要的方式对请求进行解码呢?

我可能需要编写一个过滤器来接收请求(尤其是查询字符串)并将其重新编码为参数。那是自然的方式吗?

【问题讨论】:

【参考方案1】:

实现我的目标的复杂方法确实是编写我自己的javax.servlet.Filter 并将其嵌入到过滤器链中。此解决方案符合Tomcat Wiki - Character Encoding Issues 中提供的 Apache Tomcat 建议。

更新 (2010-07-31): 此过滤器的第一个版本解释了查询字符串本身,这是一个坏主意。它不能正确处理POST 请求,并且在与其他 servlet 过滤器(如 URL 重写)结合使用时会出现问题。这个版本改为包装最初提供的参数并重新编码。要使其正常工作,URIEncoding(例如在 Tomcat 中)必须配置为 ISO-8859-1

package de.roland_illig.webapps.webapp1;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletRequestWrapper;
import javax.servlet.http.HttpServletResponse;

/**
 * Automatically determines the encoding of the request parameters. It assumes
 * that the parameters of the original request are encoded by a 1:1 mapping from
 * bytes to characters.
 * <p>
 * If the request parameters cannot be decoded by any of the given encodings,
 * the filter chain is not processed further, but a status code of 400 with a
 * helpful error message is returned instead.
 * <p>
 * The filter can be configured using the following parameters:
 * <ul>
 * <li>@code encodings: The comma-separated list of encodings (see
 * @link Charset#forName(String)) that are tried in order. The first one that
 * can decode the complete query string is taken.
 * <p>
 * Default value: @code UTF-8
 * <p>
 * Example: @code UTF-8,EUC-KR,ISO-8859-15.
 * <li>@code inputEncodingParameterName: When this parameter is defined and a
 * query parameter of that name is provided by the client, and that parameter's
 * value contains only non-escaped characters and the server knows an encoding
 * of that name, then it is used exclusively, overriding the @code encodings
 * parameter for this request.
 * <p>
 * Default value: @code null
 * <p>
 * Example: @code ie (as used by Google).
 * </ul>
 */
public class EncodingFilter implements Filter 

  private static final Pattern PAT_COMMA = Pattern.compile(",\\s*");

  private String inputEncodingParameterName = null;
  private final List<Charset> encodings = new ArrayList<Charset>();

  @Override
  @SuppressWarnings("unchecked")
  public void init(FilterConfig config) throws ServletException 
    String encodingsStr = "UTF-8";

    Enumeration<String> en = config.getInitParameterNames();
    while (en.hasMoreElements()) 
      final String name = en.nextElement();
      final String value = config.getInitParameter(name);
      if (name.equals("encodings")) 
        encodingsStr = value;
       else if (name.equals("inputEncodingParameterName")) 
        inputEncodingParameterName = value;
       else 
        throw new IllegalArgumentException("Unknown parameter: " + name);
      
    

    for (String encoding : PAT_COMMA.split(encodingsStr)) 
      Charset charset = Charset.forName(encoding);
      encodings.add(charset);
    
  

  @SuppressWarnings("unchecked")
  @Override
  public void doFilter(ServletRequest sreq, ServletResponse sres, FilterChain fc) throws IOException, ServletException 
    final HttpServletRequest req = (HttpServletRequest) sreq;
    final HttpServletResponse res = (HttpServletResponse) sres;

    final Map<String, String[]> params;
    try 
      params = Util.decodeParameters(req.getParameterMap(), encodings, inputEncodingParameterName);
     catch (IOException e) 
      res.sendError(400, e.getMessage());
      return;
    

    HttpServletRequest wrapper = new ParametersWrapper(req, params);
    fc.doFilter(wrapper, res);
  

  @Override
  public void destroy() 
    // nothing to do
  

  static abstract class Util 

    static CharsetDecoder strictDecoder(Charset cs) 
      CharsetDecoder dec = cs.newDecoder();
      dec.onMalformedInput(CodingErrorAction.REPORT);
      dec.onUnmappableCharacter(CodingErrorAction.REPORT);
      return dec;
    

    static int[] toCodePoints(String str) 
      final int len = str.length();
      int[] codePoints = new int[len];
      int i = 0, j = 0;
      while (i < len) 
        int cp = Character.codePointAt(str, i);
        codePoints[j++] = cp;
        i += Character.charCount(cp);
      
      return j == len ? codePoints : Arrays.copyOf(codePoints, len);
    

    public static String recode(String encoded, CharsetDecoder decoder) throws IOException 
      byte[] bytes = new byte[encoded.length()];
      int bytescount = 0;

      for (int i = 0; i < encoded.length(); i++) 
        char c = encoded.charAt(i);
        if (!(c <= '\u00FF'))
          throw new IOException("Invalid character: #" + (int) c);
        bytes[bytescount++] = (byte) c;
      

      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(bytes, 0, bytescount));
      String result = cbuf.toString();
      return result;
    

    static String ensureDefinedUnicode(String s) throws IOException 
      for (int cp : toCodePoints(s)) 
        if (!Character.isDefined(cp))
          throw new IOException("Undefined unicode code point: " + cp);
      
      return s;
    

    static Map<String, String[]> decodeParameters(Map<String, String[]> originalParams, List<Charset> charsets, String ieName) throws IOException 
      Map<String, String[]> params = new LinkedHashMap<String, String[]>();

      Charset ie = null;
      
        String[] values = originalParams.get(ieName);
        if (values != null) 
          for (String value : values) 
            if (!value.isEmpty() && value.indexOf('%') == -1) 
              try 
                if (ie != null)
                  throw new IOException("Duplicate value for input encoding parameter: " + ie + " and " + value + ".");
                ie = Charset.forName(value);
               catch (IllegalCharsetNameException e) 
                throw new IOException("Illegal input encoding name: " + value);
               catch (UnsupportedCharsetException e) 
                throw new IOException("Unsupported input encoding: " + value);
              
            
          
        
      

      Charset[] css = (ie != null) ? new Charset[]  ie  : charsets.toArray(new Charset[charsets.size()]);
      for (Charset charset : css) 
        try 
          params.clear();
          CharsetDecoder decoder = strictDecoder(charset);
          for (Map.Entry<String, String[]> entry : originalParams.entrySet()) 
            final String encodedName = entry.getKey();
            final String name = ensureDefinedUnicode(Util.recode(encodedName, decoder));
            for (final String encodedValue : entry.getValue()) 
              final String value = ensureDefinedUnicode(Util.recode(encodedValue, decoder));
              String[] oldValues = params.get(name);
              String[] newValues = (oldValues == null) ? new String[1] : Arrays.copyOf(oldValues, oldValues.length + 1);
              newValues[newValues.length - 1] = value;
              params.put(name, newValues);
            
          
          return params;
         catch (IOException e) 
          continue;
        
      

      List<String> kvs = new ArrayList<String>();
      for (Map.Entry<String, String[]> entry : originalParams.entrySet()) 
        final String key = entry.getKey();
        for (final String value : entry.getValue()) 
          kvs.add(key + "=" + value);
        
      
      throw new IOException("Could not decode the parameters: " + kvs.toString());
    
  

  @SuppressWarnings("unchecked")
  static class ParametersWrapper extends HttpServletRequestWrapper 

    private final Map<String, String[]> params;

    public ParametersWrapper(HttpServletRequest request, Map<String, String[]> params) 
      super(request);
      this.params = params;
    

    @Override
    public String getParameter(String name) 
      String[] values = params.get(name);
      return (values != null && values.length != 0) ? values[0] : null;
    

    @Override
    public Map getParameterMap() 
      return Collections.unmodifiableMap(params);
    

    @Override
    public Enumeration getParameterNames() 
      return Collections.enumeration(params.keySet());
    

    @Override
    public String[] getParameterValues(String name) 
      return params.get(name);
    
  

虽然代码相当小,但有一些实现细节可能会出错,所以我原以为 Tomcat 已经提供了类似的过滤器。

要激活此过滤器,我已将以下内容添加到我的web.xml

<filter>
  <filter-name>EncodingFilter</filter-name>
  <filter-class>de.roland_illig.webapps.webapp1.EncodingFilter</filter-class>
  <init-param>
    <param-name>encodings</param-name>
    <param-value>US-ASCII, UTF-8, EUC-KR, ISO-8859-15, ISO-8859-1</param-value>
  </init-param>
  <init-param>
    <param-name>inputEncodingParameterName</param-name>
    <param-value>ie</param-value>
  </init-param>
</filter>

<filter-mapping>
  <filter-name>EncodingFilter</filter-name>
  <url-pattern>/*</url-pattern>
</filter-mapping>

【讨论】:

【参考方案2】:

我们已经在 SGES2.1.1 上做了类似 Roland 的解决方案(我认为它和某些旧版 Tomcat 一样使用 catalina),但它有一些问题:

    1. 它复制了应用服务器所做的事情
    2. 它还必须注意内部 JSP 请求,包括带有参数的页面 ...
    3. 它必须解析查询字符串
    4. 每次调用 setRequest 时都必须再次执行所有操作,而且要稍后执行,原因见第 2 点
    5. 解决方法太繁重

今天,在阅读了许多博客和建议后,我删除了整个类,只做了一件简单的事情:从包装器构造函数中的 Content-Type 标头解析字符集并将其设置为包装实例。

成功了,我们所有的 988 测试都成功了。

// Finds a "charset=" parameter (case-insensitive, optionally double-quoted)
// and captures its value, up to whitespace, ';' or '"', as group 1.
private static final Pattern CHARSET_PATTERN 
    = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
// Charset name returned by parseCharset when the Content-Type header is
// missing, empty, or has no charset parameter.
private static final String CHARSET_DEFAULT = "ISO-8859-2";

/**
 * Wraps {@code request} and, if it does not already report a character
 * encoding, sets one derived from its Content-Type header.
 *
 * @param request the request to wrap
 * @throws IllegalStateException if the derived charset is rejected by
 *         {@code setCharacterEncoding}
 */
public CisHttpRequestWrapper(final HttpServletRequest request) {
  super(request);
  // An encoding that is already known is left untouched.
  if (request.getCharacterEncoding() != null) {
    return;
  }
  final String charset = parseCharset(request);
  try {
    setCharacterEncoding(charset);
  } catch (final UnsupportedEncodingException e) {
    throw new IllegalStateException("Unknown charset: " + charset, e);
  }
}

private String parseCharset(final HttpServletRequest request) 
  final String contentType = request.getHeader("Content-Type");
  if (contentType == null || contentType.isEmpty()) 
    return CHARSET_DEFAULT;
  
  final Matcher m = CHARSET_PATTERN.matcher(contentType);
  if (!m.find()) 
    return CHARSET_DEFAULT;
  
  final String charsetName = m.group(1).trim().toUpperCase();
  return charsetName;

【讨论】:

只剩下一个问题:用户登录后原始 HTTP 请求的编码已损坏 - 这是 catalina 中的错误;它不会在 Glassfish4 上发生,也可以使用 sun-web.xml 来防止,其中可以设置默认请求字符集。

以上是关于在Tomcat中自动检测URI编码的主要内容,如果未能解决你的问题,请参考以下文章

自动编码(Autoencoder)器异常检测(outlier detection)实战

自动编码器检测检测信用卡欺诈

findstr 或 grab 自动检测字符编码 (UTF-16)

Java中的自动检测字符编码

golang 使用编码自动检测读取文本文件

python3 爬虫(urllib+beautifulsoup)beautifulsoup自动检测编码错误