java词频统计——web版支持
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了java词频统计——web版支持相关的知识,希望对你有一定的参考价值。
需求概要:
1.把程序迁移到web平台,通过用户上传TXT的方式接收文件。
2.用户直接输入要统计的文本,服务器返回结果
3.在页面上给出链接 (如果有封皮、作者、字数、页数等信息更佳)或表格,展示经典英文小说词频统计结果;
4.支持用户自定义单词分隔符;
5.词汇范围对比(额外项)。
分析和设计:
1.创建web工程,利用servlet上传文件的技术实现用户向服务器上传文件。页面设置表单类型为enctype="multipart/form-data",创建文件上传文本框<input type="file" id="upfilename" name="upfilename" value="" />,服务器端使用Part p = request.getPart("upfilename");获取上传的文件,然后写入到指定地址即可。
2.直接分析用户post到服务器的内容,为了使用原有的api,可以将输入内容写到文件中,再进行分析。
3.页面展示统计结果
4.用户可以输入自定义的分隔符和设置显示统计结果前10行(可修改)。需要修改原词频统计的有效字符函数。
5.暂时不考虑
部分代码实现:
表单实现
<!--
  Form 1: uploads a TXT file (multipart/form-data) together with an optional
  custom separator string to the "upload" servlet.
  NOTE(review): both wrapper <div>s share id="txtform" and both text inputs share
  id="splitter"; ids are kept as-is because stylesheets/scripts outside this
  snippet may reference them, but duplicate ids are invalid HTML - confirm and rename.
-->
<div align="center" id="txtform">
  <form action="upload" method="post" enctype="multipart/form-data">
    <input type="file" id="upfilename" name="upfilename" value="" />
    自定义分隔符<input type="text" id="splitter" name="splitter">
    <input type="submit" id="submit" value="上传" />
  </form>
</div>
<!--
  Form 2: posts text typed by the user to the "wordcount" servlet, with the
  number of result rows to show (0 = all) and an optional custom separator string.
-->
<div align="center" id="txtform">
  <form action="wordcount" method="post">
    <div align="center">待统计内容</div>
    <textarea name="content" id="content"
      style="width: 700px; height: 200px;"></textarea>
    <br> 统计前<select id="num" name="num">
      <option value="10">10</option>
      <option value="20">20</option>
      <option value="0">所有</option>
    </select>项 <br>自定义分隔符<input type="text" id="splitter" name="splitter">
    <input type="submit" value="提交" />
    <!-- Plain ASCII quotes are required here; typographic quotes are a JavaScript syntax error. -->
    <input type="button"
      onclick="if(confirm('确认重置?')){reset()}" value="重置">
  </form>
</div>
文件上传:
1 request.setCharacterEncoding("UTF-8"); 2 response.setCharacterEncoding("UTF-8"); 3 response.setContentType("text/html"); 4 PrintWriter out = response.getWriter(); 5 byte b[] = new byte[2048]; 6 @SuppressWarnings("unused") 7 int len = 0; 8 Part p = request.getPart("upfilename"); 9 if(p==null){ 10 System.out.println("p == null"); 11 } 12 String splitter = request.getParameter("splitter"); 13 InputStream in = p.getInputStream(); 14 String name = ""+System.currentTimeMillis(); 15 FileWriter fr = new FileWriter("D:\\upload\\" + name+".txt"); 16 while ((len = in.read(b)) > 0) { 17 fr.write(new String(b)); 18 } 19 fr.close(); 20 out.println("uploaded"); 21 response.sendRedirect("wordcount?id="+name+"&splitter"+splitter); 22 out.flush(); 23 out.close();
servlet处理:
1 protected void doGet(HttpServletRequest request, HttpServletResponse response) 2 throws ServletException, IOException { 3 request.setCharacterEncoding("UTF-8"); 4 response.setCharacterEncoding("UTF-8"); 5 response.setContentType("text/html"); 6 PrintWriter out = response.getWriter(); 7 String id = request.getParameter("id"); 8 int num = 10; 9 String filename = "D:\\upload\\" + id + ".txt"; 10 WordUtil wu = WordUtilFactory.getWordUtil(); 11 long start = System.currentTimeMillis(); 12 String splitter = request.getParameter("splitter"); 13 wu.setSplitter(splitter); 14 List<String[]> result = wu.getSortedWordGroupCountBuffered(filename, splitter); 15 int size = result.size(); 16 for (int i = 0; i < (size > num ? num == 0 ? size : num : size); i++) { 17 String[] strs = result.get(i); 18 out.println(strs[1] + " : " + strs[0] + "<br>"); 19 } 20 long end = System.currentTimeMillis(); 21 out.println("execution time :" + (end - start) + "ms"); 22 out.flush(); 23 out.close(); 24 } 25 26 protected void doPost(HttpServletRequest request, HttpServletResponse response) 27 throws ServletException, IOException { 28 request.setCharacterEncoding("UTF-8"); 29 response.setCharacterEncoding("UTF-8"); 30 response.setContentType("text/html"); 31 PrintWriter out = response.getWriter(); 32 String content = request.getParameter("content"); 33 String numStr = request.getParameter("num"); 34 int num = 10; 35 if (numStr != null) { 36 num = Integer.parseInt(numStr); 37 } 38 WordUtil wu = WordUtilFactory.getWordUtil(); 39 40 long start = System.currentTimeMillis(); 41 String filename = "D://tmp.txt"; 42 43 FileWriter fr = new FileWriter(filename); 44 fr.write(content); 45 fr.close(); 46 String splitter = request.getParameter("splitter"); 47 wu.setSplitter(splitter); 48 List<String[]> result = wu.getSortedWordGroupCountBuffered(filename, splitter); 49 int size = result.size(); 50 for (int i = 0; i < (size > num ? num == 0 ? 
size : num : size); i++) { 51 String[] strs = result.get(i); 52 out.println(strs[1] + " : " + strs[0] + "<br>"); 53 } 54 long end = System.currentTimeMillis(); 55 out.println("execution time :" + (end - start) + "ms"); 56 out.flush(); 57 out.close(); 58 }
有效字符判定(即自定义分隔符)
1 public void setSplitter(String splitter) { 2 char[] tmp = splitter.toCharArray(); 3 ArrayList<Character> deleted = new ArrayList<>(); 4 for(int i=0;i<tmp.length-1;i++){ 5 if(tmp[i]==‘\\‘){ 6 char c = tmp[i+1]; 7 if(c==‘n‘){ 8 deleted.add(‘\n‘); 9 } 10 if(c==‘r‘){ 11 deleted.add(‘\n‘); 12 } 13 if(c==‘t‘){ 14 deleted.add(‘\n‘); 15 } 16 char[] copy = new char[tmp.length-2]; 17 for(int j = 0;j <i;j++){ 18 copy[j]=tmp[j]; 19 } 20 for(int j=i;j<tmp.length-2;j++){ 21 copy[j]=tmp[j+2]; 22 } 23 i++; 24 } 25 } 26 split = new char[tmp.length+deleted.size()]; 27 for(int i = 0;i<tmp.length;i++){ 28 split[i]=tmp[i]; 29 } 30 for(int i=tmp.length;i<split.length;i++){ 31 split[i]=deleted.get(split.length-tmp.length-1); 32 } 33 } 34 35 private int isCharacter(char ch, String splitter) { 36 if (split == null) { 37 if ((ch >= ‘a‘ && ch <= ‘z‘)) 38 return 1; 39 if ((ch >= ‘A‘ && ch <= ‘Z‘)) 40 return 1; 41 if (ch >= ‘0‘ && ch <= ‘9‘) 42 return 2; 43 return 0; 44 } 45 if (split.equals("")) { 46 if ((ch >= ‘a‘ && ch <= ‘z‘)) 47 return 1; 48 if ((ch >= ‘A‘ && ch <= ‘Z‘)) 49 return 1; 50 if (ch >= ‘0‘ && ch <= ‘9‘) 51 return 2; 52 return 0; 53 } 54 for (int i = 0; i < split.length; i++) { 55 if (ch == split[i]) { 56 return 0; 57 } 58 } 59 if ((ch >= ‘a‘ && ch <= ‘z‘)) 60 return 1; 61 if ((ch >= ‘A‘ && ch <= ‘Z‘)) 62 return 1; 63 if (ch >= ‘0‘ && ch <= ‘9‘) 64 return 2; 65 return 1; 66 }
web版工程地址:https://git.coding.net/jx8zjs/wordcount-web.git
ssh: git@git.coding.net:jx8zjs/wordcount-web.git
console版工程地址:https://coding.net/u/jx8zjs/p/wordCount/git
ssh: git@git.coding.net:jx8zjs/wordCount.git
以上是关于java词频统计——web版支持的主要内容,如果未能解决你的问题,请参考以下文章