Phantom/Request
PhantomJS / Request 爬虫入门
以可备案域名后缀查询为例.
PhantomJS 优势为: 可以模拟页面渲染(执行 js).
request 优势就是效率高咯.
phantom 示例
思路很简单:
- 打开首页
- 抓取数据
- 模拟点击下一页
- 重复第 2、3 步, 直到抓取不到数据为止
const driver = require('node-phantom-simple');
const phantom = require('phantomjs-prebuilt');
// Crawl a paginated listing with PhantomJS: scrape the rows of the current
// page, trigger the page's own nextPage() function, and repeat until a page
// yields no rows. The de-duplicated, sorted result is printed to stdout.
driver.create(
  {
    path: phantom.path
  },
  (err, browser) => {
    if (err) {
      // No browser handle exists yet, so there is nothing to shut down.
      console.error('failed to start phantomjs:', err);
      process.exit(1);
    }

    // Report a fatal error, stop the PhantomJS process and exit non-zero.
    const abort = (message, cause) => {
      console.error(message, cause);
      browser.exit();
      process.exit(1);
    };

    browser.createPage((err2, page) => {
      if (err2) {
        return abort('failed to create page:', err2);
      }
      page.set(
        'settings.userAgent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3175.4 Safari/537.36'
      );
      return page.open('URL地址,和谐你我他', (err3, status) => {
        console.log('opened site? ', status);
        if (err3 || status !== 'success') {
          // Scraping a page that failed to load would only fail later with a
          // less helpful error inside page.evaluate.
          return abort('failed to open site:', err3 || status);
        }
        /* global nextPage,previousPage */
        // Number of the last page whose rows were accepted into `domains`.
        let lastPage = 0;
        const domains = [];

        // Scrape the current page, then schedule the click on "next page".
        const scrapePage = () => {
          /* eslint-disable */
          page.evaluate(
            // Runs inside the PhantomJS page context, so it is kept in ES5
            // and cannot see any variable of this Node.js script.
            function () {
              var i = 1;
              var ele;
              var result = [];
              // Rows are looked up by consecutive numeric ids: 1, 2, 3, ...
              while ((ele = document.getElementById(i))) {
                result.push(ele.children[3].innerText.trim());
                i++;
              }
              return {
                data: result,
                // Parses a number out of the source text of the page's
                // previousPage function and adds 1 to it.
                // NOTE(review): presumably that number is the previous page
                // index, making `cur` the current page - confirm on the site.
                cur: ~~previousPage.toString().split(' = ')[2].split(';')[0] + 1
              };
            },
            (err4, result) => {
              if (err4 || !result) {
                return abort('failed to scrape page:', err4 || 'empty result');
              }
              const { data, cur } = result;
              console.log('page %d done', cur);
              // Accept rows only from the page directly after the last
              // accepted one, so a page that is scraped twice is not
              // counted twice.
              if (cur === lastPage + 1) {
                domains.push(...data);
                lastPage += 1;
              }
              if (data.length === 0) {
                console.log('-------');
                console.log('total domains:', domains.length);
                const arr = [...new Set(domains)];
                console.log('unique domains:', arr.length);
                console.log('-------');
                console.log(arr.sort().join('\n'));
                browser.exit();
                process.exit();
              }
              return undefined;
            }
          );
          /* eslint-enable */
          // Wait 3s before turning the page and another 3s before scraping
          // again; the delays are unchanged from the original script.
          setTimeout(() => {
            /* eslint-disable */
            page.evaluate(
              function () {
                nextPage();
              },
              (err5) => {
                if (err5) {
                  // Keep going as before, but no longer hide the failure.
                  console.error('failed to trigger nextPage():', err5);
                }
                setTimeout(scrapePage, 3000);
              }
            );
            /* eslint-enable */
          }, 3000);
        };
        return scrapePage();
      });
    });
  }
);
其中几个注意点:
- User-Agent 必须要设置
- Timeout (每次翻页前后各等待 3 秒) 是经过试验得出的较优方案
这些都是为了绕过知道创宇的反爬虫机制.
Request 示例
在这个示例里, 不推荐使用 phantom, 因为这样的界面上并没有动态的内容, 而且页面间通过传统表单形式进行跳转, 这就有一些可以利用的空间了.
比如表单中的 `page.pageSize` (每页条数) 参数:
默认选项只有三种: 5, 10, 20. 但经过测试, 设置 1000 也能正常获取数据. 所以我们这里就直接设置 1000,一次性搞定.
const request = require('request');
const cheerio = require('cheerio');
const gbk = require('gbk');
// Fetch the whole listing with a single form POST (page size 1000) and print
// the de-duplicated, sorted values of the fourth cell of every row.
request(
  {
    url: 'URL地址,和谐你我他',
    method: 'POST',
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3175.4 Safari/537.36'
    },
    form: {
      domainName: '',
      domainBlur: 0,
      domainType: 0,
      'page.pageSize': 1000,
      pageNo: 1,
      jumpPageNo: ''
    },
    // Ask for the raw Buffer so the body can be decoded by `gbk` below
    // instead of being decoded by `request` itself.
    encoding: null
  },
  (err, httpResponse, body) => {
    if (err) {
      // Without this check a network failure surfaces as an obscure
      // decoding error on an undefined body.
      console.error('request failed:', err);
      process.exit(1);
    }
    if (httpResponse.statusCode !== 200) {
      console.error('unexpected status code:', httpResponse.statusCode);
      process.exit(1);
    }
    // NOTE(review): assumes the response is GBK-encoded - confirm on the site.
    const $ = cheerio.load(gbk.toString('utf-8', body));
    const domains = [];
    // Data rows are the <tr> elements that carry an id attribute.
    $('tr[id]').each((i, ele) => {
      domains.push($(ele).children('td').eq(3).text().trim());
    });
    console.log('-------');
    console.log('total domains:', domains.length);
    const arr = [...new Set(domains)];
    console.log('unique domains:', arr.length);
    console.log('-------');
    console.log(arr.sort().join('\n'));
    process.exit();
  }
);
完整的项目源码位于: https://github.com/willin/beian-domain/tree/develop