UserAgentUtils抓取浏览器、操作系统数据

Posted by CaoPengCheng

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了UserAgentUtils抓取浏览器操作系统数据相关的知识,希望对你有一定的参考价值。

UserAgentUtils抓取浏览器、操作系统数据

一,UserAgentUtils介绍

user-agent-utils 是一个用来解析 User-Agent 字符串的 Java 类库。
其能够识别的内容包括:
超过150种不同的浏览器;
7种不同的浏览器类型;
超过60种不同的操作系统;
6种不同的设备类型;
9种不同的渲染引擎;
9种不同的Web应用,如HttpClient、Bot

GitHub主页

二,POM导包

<!-- 解析客户端操作系统、浏览器等 -->
<dependency>
    <groupId>eu.bitwalker</groupId>
    <artifactId>UserAgentUtils</artifactId>
    <version>1.21</version>
</dependency>      

三,使用

	 /*
     * Usage 1: parse the "User-Agent" header of the request returned by
     * ServletUtils.getRequest(), then print the client IP, the operating
     * system name and the browser name.
     */
     final UserAgent userAgent = UserAgent.parseUserAgentString(ServletUtils.getRequest().getHeader("User-Agent"));
     final String ip = IpUtils.getIpAddr(ServletUtils.getRequest());
     String os = userAgent.getOperatingSystem().getName();// name of the client operating system
     String browser = userAgent.getBrowser().getName(); // name of the client browser
     System.out.println(ip);
     System.out.println(os);
     System.out.println(browser);

	/*
     * Usage 2: parse the "User-Agent" header once, then print the individual
     * browser and operating-system attributes exposed by the parsed result.
     */
    String agent=request.getHeader("User-Agent");
    // Example header value:
    //"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/43.0.2357.81 Safari/537.36"
    UserAgent userAgent = UserAgent.parseUserAgentString(agent);// parse the User-Agent string
    Browser browser = userAgent.getBrowser();// the client browser
    OperatingSystem operatingSystem = userAgent.getOperatingSystem();// the client operating system
	/*
     * Browser details
     */
    System.out.println("浏览器:"+browser.getName());
    System.out.println("类型:"+browser.getBrowserType());
    System.out.println("家族:"+browser.getGroup());
    System.out.println("厂商:"+browser.getManufacturer());
    System.out.println("引擎:"+browser.getRenderingEngine());
    // The version is read from the UserAgent object, not from the Browser enum.
    System.out.println("版本:"+userAgent.getBrowserVersion());
     /*
     * Operating-system details
     */
    System.out.println("操作系统:"+operatingSystem.getName());
    System.out.println("类型:"+operatingSystem.getDeviceType());
    System.out.println("家族:"+operatingSystem.getGroup());
    System.out.println("厂商:"+operatingSystem.getManufacturer());

四,相关文件

IpUtils

package com.hcepms.base.util.http;

import com.hcepms.base.util.http.EscapeUtil;

import javax.servlet.http.HttpServletRequest;
import java.net.InetAddress;
import java.net.UnknownHostException;

/**
 * 获取IP方法
 *
 * @author CaoPengCheng
 */
public class IpUtils 
    public static String getIpAddr(HttpServletRequest request) 
        if (request == null) 
            return "unknown";
        
        String ip = request.getHeader("x-forwarded-for");
        if (ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) 
            ip = request.getHeader("Proxy-Client-IP");
        
        if (ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) 
            ip = request.getHeader("X-Forwarded-For");
        
        if (ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) 
            ip = request.getHeader("WL-Proxy-Client-IP");
        
        if (ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) 
            ip = request.getHeader("X-Real-IP");
        

        if (ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) 
            ip = request.getRemoteAddr();
        
        return "0:0:0:0:0:0:0:1".equals(ip) ? "127.0.0.1" : EscapeUtil.clean(ip);
    

    public static boolean internalIp(String ip) 
        byte[] addr = textToNumericFormatV4(ip);
        return internalIp(addr) || "127.0.0.1".equals(ip);
    

    private static boolean internalIp(byte[] addr) 
        if (StringUtils.isNull(addr) || addr.length < 2) 
            return true;
        
        final byte b0 = addr[0];
        final byte b1 = addr[1];
        // 10.x.x.x/8
        final byte SECTION_1 = 0x0A;
        // 172.16.x.x/12
        final byte SECTION_2 = (byte) 0xAC;
        final byte SECTION_3 = (byte) 0x10;
        final byte SECTION_4 = (byte) 0x1F;
        // 192.168.x.x/16
        final byte SECTION_5 = (byte) 0xC0;
        final byte SECTION_6 = (byte) 0xA8;
        switch (b0) 
            case SECTION_1:
                return true;
            case SECTION_2:
                if (b1 >= SECTION_3 && b1 <= SECTION_4) 
                    return true;
                
            case SECTION_5:
                switch (b1) 
                    case SECTION_6:
                        return true;
                
            default:
                return false;
        
    

    /**
     * 将IPv4地址转换成字节
     *
     * @param text IPv4地址
     * @return byte 字节
     */
    public static byte[] textToNumericFormatV4(String text) 
        if (text.length() == 0) 
            return null;
        

        byte[] bytes = new byte[4];
        String[] elements = text.split("\\\\.", -1);
        try 
            long l;
            int i;
            switch (elements.length) 
                case 1:
                    l = Long.parseLong(elements[0]);
                    if ((l < 0L) || (l > 4294967295L)) 
                        return null;
                    
                    bytes[0] = (byte) (int) (l >> 24 & 0xFF);
                    bytes[1] = (byte) (int) ((l & 0xFFFFFF) >> 16 & 0xFF);
                    bytes[2] = (byte) (int) ((l & 0xFFFF) >> 8 & 0xFF);
                    bytes[3] = (byte) (int) (l & 0xFF);
                    break;
                case 2:
                    l = Integer.parseInt(elements[0]);
                    if ((l < 0L) || (l > 255L)) 
                        return null;
                    
                    bytes[0] = (byte) (int) (l & 0xFF);
                    l = Integer.parseInt(elements[1]);
                    if ((l < 0L) || (l > 16777215L)) 
                        return null;
                    
                    bytes[1] = (byte) (int) (l >> 16 & 0xFF);
                    bytes[2] = (byte) (int) ((l & 0xFFFF) >> 8 & 0xFF);
                    bytes[3] = (byte) (int) (l & 0xFF);
                    break;
                case 3:
                    for (i = 0; i < 2; ++i) 
                        l = Integer.parseInt(elements[i]);
                        if ((l < 0L) || (l > 255L)) 
                            return null;
                        
                        bytes[i] = (byte) (int) (l & 0xFF);
                    
                    l = Integer.parseInt(elements[2]);
                    if ((l < 0L) || (l > 65535L)) 
                        return null;
                    
                    bytes[2] = (byte) (int) (l >> 8 & 0xFF);
                    bytes[3] = (byte) (int) (l & 0xFF);
                    break;
                case 4:
                    for (i = 0; i < 4; ++i) 
                        l = Integer.parseInt(elements[i]);
                        if ((l < 0L) || (l > 255L)) 
                            return null;
                        
                        bytes[i] = (byte) (int) (l & 0xFF);
                    
                    break;
                default:
                    return null;
            
         catch (NumberFormatException e) 
            return null;
        
        return bytes;
    

    public static String getHostIp() 
        try 
            return InetAddress.getLocalHost().getHostAddress();
         catch (UnknownHostException e) 
        
        return "127.0.0.1";
    

    public static String getHostName() 
        try 
            return InetAddress.getLocalHost().getHostName();
         catch (UnknownHostException e) 
        
        return "未知";
    

HTMLFilter

package com.hcepms.base.util.http;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * HTML过滤器,用于去除XSS漏洞隐患。
 *
 * @author CaoPengCheng
 */
public final class HTMLFilter 
    /**
     * regex flag union representing /si modifiers in php
     **/
    private static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
    private static final Pattern P_COMMENTS = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
    private static final Pattern P_COMMENT = Pattern.compile("^!--(.*)--$", REGEX_FLAGS_SI);
    private static final Pattern P_TAGS = Pattern.compile("<(.*?)>", Pattern.DOTALL);
    private static final Pattern P_END_TAG = Pattern.compile("^/([a-z0-9]+)", REGEX_FLAGS_SI);
    private static final Pattern P_START_TAG = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
    private static final Pattern P_QUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)=([\\"'])(.*?)\\\\2", REGEX_FLAGS_SI);
    private static final Pattern P_UNQUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)(=)([^\\"\\\\s']+)", REGEX_FLAGS_SI);
    private static final Pattern P_PROTOCOL = Pattern.compile("^([^:]+):", REGEX_FLAGS_SI);
    private static final Pattern P_ENTITY = Pattern.compile("&#(\\\\d+);?");
    private static final Pattern P_ENTITY_UNICODE = Pattern.compile("&#x([0-9a-f]+);?");
    private static final Pattern P_ENCODE = Pattern.compile("%([0-9a-f]2);?");
    private static final Pattern P_VALID_ENTITIES = Pattern.compile("&([^&;]*)(?=(;|&|$))");
    private static final Pattern P_VALID_QUOTES = Pattern. // ……(原文代码在此处被截断,HTMLFilter 其余部分缺失)

以上是关于UserAgentUtils抓取浏览器、操作系统数据的主要内容,如果未能解决你的问题,请参考以下文章

UserAgentUtils(浏览器解析工具)使用总结

UserAgentUtils获取浏览器信息

Request获取浏览器信息

怎么用VBA或网络爬虫程序抓取网站数据

用webbrowser操作浏览器登录微博后,怎么抓取数据,Python语言

UiBot无法抓取Google Chrome元素和数据抓取工具无法使用的解决方案