微博excel数据清洗(Java版)

Posted 2020-10-13 gaomatlab

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了微博excel数据清洗(Java版)相关的知识，希望对你有一定的参考价值。

微博数据清洗(Java版)

原创 2013年12月10日 10:58:24

大数据公益大学提供的一份数据，义务处理一下，原始数据是Excel，含有html标签，如下：

技术分享

要求清洗掉html标签和微博内容中的url地址。

主要分为两部分：

1.处理文本，清洗数据。

2.处理excel读写操作。

上代码：

ExcelUtil类，包含Excel 2003/2007的读写操作。Excel使用Apache POI进行操作，需要的jar包如下：

[java] view plain copy

package dat.datadeal;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
/**
 * Reads and writes Excel workbooks through Apache POI.
 *
 * <p>Reading supports both the Excel 2003 binary format and the Excel 2007
 * OOXML format (chosen by file extension). Writing always produces the
 * Excel 2003 binary format.
 *
 * @author daT [email protected]
 */
public class ExcelUtil {

    private static final Logger LOGGER = Logger.getLogger(ExcelUtil.class.getName());

    /** Output pattern used for date-formatted numeric cells. */
    private static final String DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Reads the first sheet of an Excel file completely into memory.
     *
     * <p>The number of columns is taken from row 0; every returned array has
     * that length. Missing cells are returned as empty strings and rows that
     * are not defined in the sheet are skipped. Very large workbooks need a
     * different (streaming) approach.
     *
     * @param filePath path of the workbook; a name ending in {@code .xlsx}
     *                 (case-insensitive) is read as Excel 2007, anything else
     *                 as Excel 2003
     * @return one {@code String[]} per row, each element being one cell; an
     *         empty list if the file cannot be opened or read
     */
    public List<String[]> readExcel(String filePath) {
        List<String[]> dataList = new ArrayList<String[]>();

        InputStream is = null;
        Workbook wb;
        try {
            is = new FileInputStream(new File(filePath));
            wb = isExcel2007(filePath) ? new XSSFWorkbook(is) : new HSSFWorkbook(is);
        } catch (IOException ex) {
            // Covers FileNotFoundException as well. Without a workbook there
            // is nothing to read, so report the problem and return no rows
            // instead of failing later with a NullPointerException.
            LOGGER.log(Level.SEVERE, "Cannot read Excel file: " + filePath, ex);
            return dataList;
        } finally {
            // The workbook has been loaded from the stream at this point, so
            // the stream can be released.
            closeQuietly(is);
        }

        Sheet sheet = wb.getSheetAt(0);

        // getLastCellNum() is "index of last cell + 1" (or -1 for a row
        // without cells), so it also covers header rows that contain gaps.
        int totalCells = 0;
        Row header = sheet.getRow(0);
        if (header != null && header.getLastCellNum() > 0) {
            totalCells = header.getLastCellNum();
        }

        // Iterate by row index up to the last used row; the physical row
        // count would stop too early when the sheet contains undefined rows.
        int lastRow = sheet.getLastRowNum();
        for (int r = 0; r <= lastRow; r++) {
            Row row = sheet.getRow(r);
            if (row == null) {
                continue;
            }
            String[] rowValues = new String[totalCells];
            for (int c = 0; c < totalCells; c++) {
                Cell cell = row.getCell(c);
                rowValues[c] = (cell == null) ? "" : convertCellStr(cell);
            }
            dataList.add(rowValues);
        }
        return dataList;
    }

    /**
     * Converts a cell to its string representation.
     *
     * @param cell a non-null cell
     * @return the string, boolean, formatted date, number or formula text of
     *         the cell; an empty string for any other cell type
     */
    private String convertCellStr(Cell cell) {
        switch (cell.getCellType()) {
            case Cell.CELL_TYPE_STRING:
                return cell.getStringCellValue();
            case Cell.CELL_TYPE_BOOLEAN:
                return String.valueOf(cell.getBooleanCellValue());
            case Cell.CELL_TYPE_NUMERIC:
                // Dates are stored as numbers; the cell format tells them apart.
                if (DateUtil.isCellDateFormatted(cell)) {
                    return formatTime(cell.getDateCellValue());
                }
                return String.valueOf(cell.getNumericCellValue());
            case Cell.CELL_TYPE_FORMULA:
                // The formula text itself, not its evaluated result.
                return cell.getCellFormula();
            default:
                return "";
        }
    }

    /**
     * Tells whether a file name denotes an Excel 2007 workbook.
     *
     * @param fileName file name or path
     * @return {@code true} if the name ends in {@code .xlsx}, ignoring case
     */
    private boolean isExcel2007(String fileName) {
        return fileName.matches("^.+\\.(?i)(xlsx)$");
    }

    /**
     * Formats a date as {@code yyyy-MM-dd HH:mm:ss} in the default time zone.
     *
     * <p>The date is formatted directly rather than being parsed back from
     * {@code Date.toString()}: parsing that text with a 12-hour {@code hh}
     * field maps hour 12 to 0 and relies on ambiguous zone abbreviations.
     *
     * @param date the date to format, may be {@code null}
     * @return the formatted date, or an empty string for {@code null}
     */
    private String formatTime(Date date) {
        if (date == null) {
            return "";
        }
        // SimpleDateFormat is not thread-safe, so a fresh instance is used.
        return new SimpleDateFormat(DATE_PATTERN).format(date);
    }

    /** Closes a stream, logging (not propagating) any failure. */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ex) {
            LOGGER.log(Level.WARNING, "Failed to close input stream", ex);
        }
    }

    /**
     * Writes in-memory data to a new workbook with a single sheet named
     * {@code "sheet"}; all cells are centre-aligned.
     *
     * <p>The workbook is always written in the Excel 2003 binary format,
     * whatever extension {@code filePath} has. Borders, fonts and column
     * widths are not set here.
     *
     * @param filePath path of the output file
     * @param dataList rows to write; each inner list holds the cells of one row
     * @throws IOException if the file cannot be created or written
     */
    public void writeExcel(String filePath, List<List<String>> dataList) throws IOException {
        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFSheet sheet = wb.createSheet("sheet");

        // One shared style for every cell: horizontally centred.
        HSSFCellStyle style = wb.createCellStyle();
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);

        // Row i of the sheet receives element i of dataList.
        for (int i = 0; i < dataList.size(); i++) {
            HSSFRow row = sheet.createRow(i);
            List<String> values = dataList.get(i);
            for (int j = 0; j < values.size(); j++) {
                HSSFCell cell = row.createCell(j);
                cell.setCellValue(values.get(j));
                cell.setCellStyle(style);
            }
        }

        // Close the stream even when writing fails.
        FileOutputStream fout = new FileOutputStream(filePath);
        try {
            wb.write(fout);
        } finally {
            fout.close();
        }
    }
}

DataClean类，包含对html标签和信息中url的清洗。

[java] view plain copy

package dat.datadeal;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Text clean-up helpers for Weibo data: removal of HTML markup and of URLs,
 * plus a {@code main} that applies both to every cell of an Excel sheet.
 *
 * @author daT [email protected]
 */
public class DataClean {

    /**
     * A complete {@code <script ...> ... </script>} element including its
     * body. {@code [\s\S]} is used so that the body may span several lines.
     */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile(
            "<\\s*script[^>]*>[\\s\\S]*?<\\s*/\\s*script\\s*>",
            Pattern.CASE_INSENSITIVE);

    /** Any single HTML tag. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");

    /**
     * An http/https URL: the literal scheme and {@code ://}, followed by a
     * run of characters that commonly occur in URLs.
     */
    private static final Pattern URL = Pattern.compile(
            "https?://[0-9A-Za-z:/\\-_#?=.&%]*");

    /**
     * Removes HTML markup from a string.
     *
     * <p>Script elements are removed together with their content first, so
     * that script source does not remain as text; after that every remaining
     * tag is removed and the text between tags is kept.
     *
     * @param inputString text that may contain HTML, may be {@code null}
     * @return the text without markup; an empty string for {@code null}
     */
    public static String delHtml(String inputString) {
        if (inputString == null) {
            return "";
        }
        String withoutScripts = SCRIPT_BLOCK.matcher(inputString).replaceAll("");
        return HTML_TAG.matcher(withoutScripts).replaceAll("");
    }

    /**
     * Removes http and https URLs from a string.
     *
     * @param str text that may contain URLs, may be {@code null}
     * @return the text without URLs; an empty string for {@code null}
     */
    public static String dealWithUrl(String str) {
        if (str == null) {
            return "";
        }
        Matcher m = URL.matcher(str);
        return m.replaceAll("");
    }

    /**
     * Reads the source workbook, strips HTML and URLs from every cell and
     * writes the cleaned rows to a new workbook.
     *
     * @param args unused
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        ExcelUtil excelUtil = new ExcelUtil();
        List<List<String>> writeList = new ArrayList<List<String>>();
        List<String[]> readList = excelUtil.readExcel("/home/dat/javatest/微博数据_.xlsx");
        for (String[] lineArray : readList) {
            List<String> cleanedRow = new ArrayList<String>();
            for (String cellText : lineArray) {
                // Strip markup first: URLs inside tag attributes disappear
                // with the tag, the URL pass then handles the visible text.
                cleanedRow.add(DataClean.dealWithUrl(DataClean.delHtml(cellText)));
            }
            writeList.add(cleanedRow);
        }
        // ExcelUtil.writeExcel builds an HSSFWorkbook (Excel 2003 binary
        // format), so the output file gets the matching .xls extension.
        excelUtil.writeExcel("/home/dat/javatest/weibo.xls", writeList);
        System.out.println("job has finished...........");
    }
}

清洗后数据：

以上是关于微博excel数据清洗(Java版)的主要内容，如果未能解决你的问题，请参考以下文章