node爬虫之gbk网页中文乱码解决方案

中文乱码具体是指:用 node 请求 gb2312(gbk)编码的网页时,如果响应内容按 node 默认的 utf-8 解码,网页中的中文就无法正确显示,需要先进行转码。

解决办法

直接用 iconv-lite 模块进行转码。

iconv-lite 是一个进行编码转换的模块(node 默认编码为 utf-8)。传给 decode 的数据必须是 Buffer 类型,因此在转码之前不要把响应内容转成字符串(例如不要调用 setEncoding,也不要用字符串拼接各个 chunk)。

使用http模块

示例代码:
const http = require("http");
const querystring = require("querystring");
const iconv = require("iconv-lite");
// Form fields to submit; serialized as application/x-www-form-urlencoded below.
const postData = {
  username: "username",
};

// Serialize once so the same string feeds both the Content-Length header and the body.
const body = querystring.stringify(postData);

const req = http.request(
  {
    hostname: "xxxx.com",
    port: 80,
    path: "/pathname",
    method: "POST",
    headers: {
      "Content-Type": "application/x-www-form-urlencoded",
      // Byte length (not string length). Sending Content-Length makes Node send a
      // fixed-length body instead of its default chunked transfer encoding.
      "Content-Length": Buffer.byteLength(body),
    },
  },
  (res) => {
    // Collect the raw Buffers. Decoding each chunk separately (or calling
    // res.setEncoding) would corrupt multi-byte characters split across chunks.
    const chunks = [];
    res.on("data", (chunk) => {
      chunks.push(chunk);
    });
    res.on("end", () => {
      // Decode the complete byte sequence in one pass.
      const html = iconv.decode(Buffer.concat(chunks), "gb2312");
      console.log(html);
    });
    // Report failures that occur while the response body is being read.
    res.on("error", (e) => {
      console.error(`problem with response: ${e.message}`);
    });
  }
);

// Report failures that occur while sending the request (DNS, connection, ...).
req.on("error", (e) => {
  console.error(`problem with request: ${e.message}`);
});

// write data to request body
req.write(body);
req.end();

使用axios

示例代码:
const iconv = require("iconv-lite");
const axios = require("axios");
// Ask axios for the raw bytes ("arraybuffer") instead of a decoded string, then
// let iconv-lite decode those bytes as gb2312.
axios
  .get(`url`, { responseType: "arraybuffer" })
  .then((response) => {
    const html = iconv.decode(response.data, "gb2312");
    console.log(html);
  })
  .catch((e) => {
    // Without this handler a network/HTTP failure is an unhandled rejection.
    console.error(`problem with request: ${e.message}`);
  });