selenium 爬取空间说说
Posted tele-share
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了selenium 爬取空间说说相关的知识,希望对你有一定的参考价值。
1 package cn.hb.util;
2
3 import java.io.File;
4 import java.io.FileWriter;
5 import java.io.IOException;
6 import java.util.Set;
7 import java.util.concurrent.TimeUnit;
8 import org.openqa.selenium.By;
9 import org.openqa.selenium.Cookie;
10 import org.openqa.selenium.JavascriptExecutor;
11 import org.openqa.selenium.Keys;
12 import org.openqa.selenium.WebDriver;
13 import org.openqa.selenium.WebElement;
14 import org.openqa.selenium.firefox.FirefoxDriver;
15 import org.openqa.selenium.firefox.FirefoxOptions;
16 import org.openqa.selenium.interactions.Actions;
17
18 /**
19 * 爬取说说写入到txt中,爬取100条
20 *
21 * @author tele
22 *
23 */
24 public class QZTwitterCrawler {
25 static String url = "https://user.qzone.qq.com/1350560858";
26 static int maxSize = 100;
27 static int pageSize = 20;
28 static String userName="qq";
29 static String pwd = "密码";
30 public static void main(String[] args) throws InterruptedException, IOException {
31 login();
32 }
33
34 /**
35 * 登录
36 *
37 * @throws InterruptedException
38 * @throws IOException
39 */
40 public static void login() throws InterruptedException, IOException {
41 System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");
42
43 FirefoxOptions options = new FirefoxOptions();
44 options.setBinary("F:/ff/firefox.exe");
45
46 WebDriver driver = new FirefoxDriver(options);
47 driver.manage().window().maximize();
48 // 超时
49 try {
50 driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
51 driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
52 driver.get(url);
53 } catch (Exception e) {
54 System.out.println("所需元素已出现,停止加载页面");
55 } finally {
56 // 切换到登录login
57 driver.switchTo().frame("login_frame");
58
59 WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
60 System.out.println(switcher_plogin.getText());
61 if (switcher_plogin.isDisplayed()) {
62 switcher_plogin.click();
63 }
64 // 用户名
65 driver.findElement(By.id("u")).clear();
66 driver.findElement(By.id("u")).sendKeys(userName);
67
68 // 密码
69 driver.findElement(By.id("p")).clear();
70 driver.findElement(By.id("p")).sendKeys(pwd);
71
72 // 登录
73 try {
74 driver.findElement(By.id("login_button")).click();
75 Thread.sleep(3000);
76 } catch (Exception e) {
77 e.printStackTrace();
78 } finally {
79 if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
80 System.out.println("登录失败!5秒后再次尝试登录");
81 Thread.sleep(5000);
82 driver.findElement(By.id("login_button")).click();
83 }
84 }
85
86 // 退出frame
87 driver.switchTo().defaultContent();
88
89 System.out.println(driver.getCurrentUrl());
90
91 JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;
92 // 如果有亲密度提示
93
94 try { WebElement fs_guide = driver.findElement(By.xpath(
95 "//div[@id=‘friendship_promote_layer‘]/table[@class=‘tbl-fs-guide‘]//a"
96 )); if(fs_guide != null && fs_guide.isDisplayed()) {
97 fs_guide.click(); } } catch (Exception e) { e.printStackTrace();
98 }finally {
99
100 }
101
102
103 // 点击说说
104 driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_311>a")).click();
105
106 Thread.sleep(2000);
107
108 // 切换到frame
109 driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
110
111 Thread.sleep(5000);
112
113 // 拼接cookie
114 /* StringBuilder builder = new StringBuilder();
115 Set<Cookie> cookieSet = driver.manage().getCookies();
116 cookieSet.forEach(c -> builder.append(c.getName()).append("=").append(c.getValue()).append("; "));
117 cookies = builder.toString();*/
118
119 // 定位元素
120 saveTwitter(driver);
121
122 System.out.println("内容提取完毕,退出浏览器");
123 driver.quit();
124
125 }
126 }
127
128 /**
129 * 序列化
130 * @param driver
131 * @return
132 * @throws InterruptedException
133 * @throws IOException
134 */
135 public static void saveTwitter(WebDriver driver) throws InterruptedException, IOException {
136 File file = new File("f:/qz/twitter.txt");
137
138 // 文件夹检测
139 if (!file.getParentFile().exists()) {
140 file.mkdirs();
141 } else {
142 file.delete();
143 }
144
145
146
147 FileWriter fileWriter = new FileWriter(file, true);
148
149 String xpath;
150 // 模拟按键进行滚动
151 Actions actions = new Actions(driver);
152
153
154 //说说总量
155 String totalNumStr = driver.findElement(By.xpath("//div[@class=‘feed_num‘]/a")).getText();
156 int totalNum = Integer.parseInt(totalNumStr);
157
158 // 计算页数
159 int totalPage = (int) Math.ceil((double)Math.min(maxSize, totalNum) / (double) pageSize);
160
161 // 构造xpath
162 for (int i = 0; i < totalPage; i++) {
163
164 for (int j = 0; j < pageSize; j++) {
165 xpath = "//ol[@id=‘msgList‘]/li[" + (j + 1) + "]/div[3]/div[2]/pre[@class=‘content‘]";
166 // 获取说说内容
167 try {
168 WebElement element = driver.findElement(By.xpath(xpath));
169 String text = element.getText();
170 System.out.println("本页第" + (j + 1) + "条 :" + text);
171 fileWriter.write(text, 0, text.length());
172
173 } catch (Exception e) {
174 e.printStackTrace();
175 } finally {
176
177 }
178 if (j % 2 == 0) {
179 actions.sendKeys(Keys.ARROW_DOWN).perform();
180 }
181 }
182 System.out.println("第" + (i + 1) + "页说说爬取完毕");
183 // 分页
184 if ((i + 2) <= totalPage) {
185 driver.findElement(By.xpath("//a[@id=‘pager_num_" + i + "_" + (i + 2) + "‘]")).click();
186 // 等待页面加载
187 Thread.sleep(3000);
188 }
189 }
190
191 if (fileWriter != null) {
192 fileWriter.close();
193 }
194 }
195
196 }
比爬取相册简单一些,唯一有点绕的是页码的构造。我写的这个只支持获取文字,可以用来生成词云。
以上是关于selenium 爬取空间说说的主要内容,如果未能解决你的问题,请参考以下文章