0%

网络爬虫逆向(1688工厂)

目标网址:

https://sale.1688.com/factory/category.html?spm=a260k.22464671.home2019category.1.69517a6eFclvLv&mainId=10166

爬取目标:

  • 工厂名称
  • 地点等等

image-20230412100533504

网页解析

按F12打开开发者工具并刷新网页,然后在“网络”面板中搜索字段“深圳市龙岗区馨衣舍服饰厂”。

image-20230412100701890

发现可以搜索到对应的数据包。

打开数据包,点击预览查看数据。

image-20230412100755166

在该数据包中能够找到对应目标数据。

接下来只需要能够顺利请求到该数据包即可。

点击该数据包的标头查看数据包的请求方式。

image-20230412100910804

发现该数据包的请求信息如下:

请求API:https://h5api.m.1688.com/h5/mtop.taobao.widgetservice.getjsoncomponent/1.0/

请求方法:GET请求

接下来查看该数据包在发送GET请求时需要传输的参数。

image-20230412101044717

初步看了一下,该数据包传输的参数较多,为了便于后续的进一步分析,还需要搞清楚这些参数中哪些是固定的,哪些是变化的。

刷新网页再次查看该数据包的参数。

image-20230412101211278

经过对比发现两次请求中只有参数 t 和 sign 发生了变化,其他内容都是固定值。

t很明显是一个13位的时间戳数据,sign则需要进一步分析。

在网络数据包列表中找到该数据包,然后点击启动器,进入JS文件。

image-20230412101642172

在JS文件中搜索sign

image-20230412101738542

找到图中所示内容,在j附近打上断点。刷新网页。

可以看到 sign = j,而 j 是将后面的内容拼接成字符串后传入 h 函数计算得到的。

拼接的内容为:d.token + "&" + i + "&" + g + "&" + c.data

其中:

  • d.token在当前会话中为固定值39fa28587fbbb780591c41b82f10768d(该值看起来对应 Cookie 中 _m_h5_tk 下划线之前的部分,Cookie 更新后需要同步修改,请以实际抓包为准)

image-20230412102048755

  • i为13位时间戳

image-20230412102159612

  • g为固定值12574478

image-20230412102225158

  • c.data:断点首次命中时它只是一个初始值,在后续生成j的过程中还会发生变化;最终参与签名的c.data在每次刷新时保持不变,可视为固定值。

image-20230412102501283

点击继续执行脚本,直到j的内容被完全生成。

image-20230412102527317

这个时候再查看c.data的值。

image-20230412102623904

在控制台中打印出c.data

'{"cid":"TpFacRecommendService:TpFacRecommendService","methodName":"execute","params":"{\\"query\\":\\"mainCate=10166&leafCate=\\",\\"sort\\":\\"mix\\",\\"pageNo\\":\\"1\\",\\"pageSize\\":\\"20\\",\\"from\\":\\"PC\\",\\"trafficSource\\":\\"pc_index_recommend\\"}"}'

接下来使用Python代码模拟该数据生成。

1
2
3
4
5
6
7
8
9
10
import time

# Rebuild the plaintext that the page passes to its h() signing function:
# token & 13-digit millisecond timestamp & appKey & request data.
token = '39fa28587fbbb780591c41b82f10768d'
i = int(time.time() * 1000)  # current time in milliseconds
g = '12574478'
data = '{"cid":"TpFacRecommendService:TpFacRecommendService","methodName":"execute","params":"{\\"query\\":\\"mainCate=10166&leafCate=\\",\\"sort\\":\\"mix\\",\\"pageNo\\":\\"1\\",\\"pageSize\\":\\"20\\",\\"from\\":\\"PC\\",\\"trafficSource\\":\\"pc_index_recommend\\"}"}'

# Join the four parts with "&" in the same order the page's JS uses.
sign_key = "&".join([token, str(i), g, data])
print(sign_key)
1
39fa28587fbbb780591c41b82f10768d&1681266707233&12574478&{"cid":"TpFacRecommendService:TpFacRecommendService","methodName":"execute","params":"{\"query\":\"mainCate=10166&leafCate=\",\"sort\":\"mix\",\"pageNo\":\"1\",\"pageSize\":\"20\",\"from\":\"PC\",\"trafficSource\":\"pc_index_recommend\"}"}

接下来再看h函数,光标放到h函数上方点击函数路径进入函数。

image-20230412102837441

新建JS文件,将h函数内容复制到JS文件中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/**
 * Compute the 32-character lowercase hex digest used as the `sign` value.
 *
 * The input string has "\r\n" normalised to "\n", is encoded to bytes one
 * UTF-16 code unit at a time (1, 2 or 3 bytes each; surrogate pairs are NOT
 * combined), padded, and then run through the four 16-step MD5 rounds.
 *
 * Only ES5-level syntax is used so the file keeps working under whatever
 * JavaScript runtime execjs selects.
 *
 * @param {string} a - Text to hash.
 * @returns {string} Hex digest, least-significant byte of each word first.
 */
function h(a) {
    // Left-rotation amounts: one row per round, cycled every four steps.
    var SHIFTS = [
        [7, 12, 17, 22],
        [5, 9, 14, 20],
        [4, 11, 16, 23],
        [6, 10, 15, 21]
    ];

    // Additive constant for each of the 64 steps, in execution order.
    var STEP_CONSTANTS = [
        3614090360, 3905402710, 606105819, 3250441966,
        4118548399, 1200080426, 2821735955, 4249261313,
        1770035416, 2336552879, 4294925233, 2304563134,
        1804603682, 4254626195, 2792965006, 1236535329,
        4129170786, 3225465664, 643717713, 3921069994,
        3593408605, 38016083, 3634488961, 3889429448,
        568446438, 3275163606, 4107603335, 1163531501,
        2850285829, 4243563512, 1735328473, 2368359562,
        4294588738, 2272392833, 1839030562, 4259657740,
        2763975236, 1272893353, 4139469664, 3200236656,
        681279174, 3936430074, 3572445317, 76029189,
        3654602809, 3873151461, 530742520, 3299628645,
        4096336452, 1126891415, 2878612391, 4237533241,
        1700485571, 2399980690, 4293915773, 2240044497,
        1873313359, 4264355552, 2734768916, 1309151649,
        4149444226, 3174756917, 718787259, 3951481745
    ];

    // Rotate a 32-bit value left by `bits` positions.
    function rotateLeft(value, bits) {
        return (value << bits) | (value >>> (32 - bits));
    }

    // Add two integers modulo 2^32 (result is a signed 32-bit integer).
    function add32(x, y) {
        return (x + y) | 0;
    }

    // Encode the text into a byte list, one UTF-16 code unit at a time.
    function toBytes(text) {
        var normalized = text.replace(/\r\n/g, "\n");
        var bytes = [];
        for (var pos = 0; pos < normalized.length; pos++) {
            var unit = normalized.charCodeAt(pos);
            if (unit < 128) {
                bytes.push(unit);
            } else if (unit < 2048) {
                bytes.push(unit >> 6 | 192, unit & 63 | 128);
            } else {
                bytes.push(unit >> 12 | 224, unit >> 6 & 63 | 128, unit & 63 | 128);
            }
        }
        return bytes;
    }

    // Pack bytes into little-endian 32-bit words, append the 0x80 marker and
    // store the bit length in the last two words of the final 16-word block.
    function toPaddedWords(bytes) {
        var byteCount = bytes.length;
        var wordCount = 16 * (Math.floor((byteCount + 8) / 64) + 1);
        var words = [];
        var index;
        for (index = 0; index < wordCount; index++) {
            words.push(0);
        }
        for (index = 0; index < byteCount; index++) {
            words[index >> 2] |= bytes[index] << (index % 4 * 8);
        }
        words[byteCount >> 2] |= 128 << (byteCount % 4 * 8);
        words[wordCount - 2] = byteCount << 3;
        words[wordCount - 1] = byteCount >>> 29;
        return words;
    }

    // Hex-encode a 32-bit word, least-significant byte first.
    function wordToHex(word) {
        var hex = "";
        for (var byteIndex = 0; byteIndex < 4; byteIndex++) {
            var byteValue = word >>> (8 * byteIndex) & 255;
            hex += (byteValue < 16 ? "0" : "") + byteValue.toString(16);
        }
        return hex;
    }

    var words = toPaddedWords(toBytes(a));
    var stateA = 1732584193;
    var stateB = 4023233417;
    var stateC = 2562383102;
    var stateD = 271733878;

    for (var base = 0; base < words.length; base += 16) {
        var wa = stateA;
        var wb = stateB;
        var wc = stateC;
        var wd = stateD;

        for (var step = 0; step < 64; step++) {
            var round = step >> 4;
            var mixed;
            var wordIndex;
            if (round === 0) {
                mixed = (wb & wc) | (~wb & wd);
                wordIndex = step;
            } else if (round === 1) {
                mixed = (wb & wd) | (wc & ~wd);
                wordIndex = (5 * step + 1) % 16;
            } else if (round === 2) {
                mixed = wb ^ wc ^ wd;
                wordIndex = (3 * step + 5) % 16;
            } else {
                mixed = wc ^ (wb | ~wd);
                wordIndex = (7 * step) % 16;
            }

            var total = add32(wa, add32(add32(mixed, words[base + wordIndex]), STEP_CONSTANTS[step]));
            var updated = add32(rotateLeft(total, SHIFTS[round][step % 4]), wb);

            // Rotate the working registers so the next step updates the next one.
            wa = wd;
            wd = wc;
            wc = wb;
            wb = updated;
        }

        stateA = add32(stateA, wa);
        stateB = add32(stateB, wb);
        stateC = add32(stateC, wc);
        stateD = add32(stateD, wd);
    }

    return wordToHex(stateA) + wordToHex(stateB) + wordToHex(stateC) + wordToHex(stateD);
}

然后使用Python调用该JS文件中的h函数,对sign_key进行加密,得到sign。

1
2
3
4
5
6
7
import execjs

# Read the JS file that holds the copied h() function.
with open('demo1_sign加密.js', 'r', encoding='UTF-8') as f:
    jscode = f.read()

# Compile the script and pass the plaintext sign_key to h() to obtain sign.
sign = execjs.compile(jscode).call('h', sign_key)
print(sign)
1
9ab892292b22a6e1980824595087b7cd

生成的sign为9ab892292b22a6e1980824595087b7cd

为了验证算法的正确性,可以把时间戳修改为和网页上的一致,将代码生成的sign与网页进行对比。

image-20230412103639911

将时间戳修改为1681266229197,再次运行脚本,查看生成的sign

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import time
import execjs

# Inputs of the sign parameter.
token = '39fa28587fbbb780591c41b82f10768d'
# The timestamp is pinned to the value captured from the web page (instead of
# int(time.time() * 1000)) so the generated sign can be compared with the
# one the browser sent.
i = 1681266229197
g = '12574478'
data = '{"cid":"TpFacRecommendService:TpFacRecommendService","methodName":"execute","params":"{\\"query\\":\\"mainCate=10166&leafCate=\\",\\"sort\\":\\"mix\\",\\"pageNo\\":\\"1\\",\\"pageSize\\":\\"20\\",\\"from\\":\\"PC\\",\\"trafficSource\\":\\"pc_index_recommend\\"}"}'

# Plaintext in the order the page's JS concatenates it.
sign_key = token + "&" + str(i) + "&" + g + "&" + data

# Read the JS file that holds the copied h() function.
with open('demo1_sign加密.js', 'r', encoding='UTF-8') as f:
    jscode = f.read()

# Hash the plaintext with h() to obtain the sign value.
sign = execjs.compile(jscode).call('h', sign_key)
print(sign)
1
cbf688f6c9631f914ca412b1aef031e4

发现与网页中的sign一致。

网页数据采集

获取到sign数据后就可以直接发送GET请求去获取数据了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Use the computed sign parameter to fetch the data.
import requests

# Request API
url = 'https://h5api.m.1688.com/h5/mtop.taobao.widgetservice.getjsoncomponent/1.0/'

# Query parameters captured from the browser. Only `t` and `sign` change
# between requests; `t` must be the same timestamp that was used when
# building sign_key.
params = {
    'jsv': '2.6.1',
    'appKey': '12574478',
    't': i,
    # Send the hash returned by h(), not the plaintext sign_key.
    'sign': sign,
    'v': '1.0',
    'type': 'jsonp',
    'isSec': 0,
    'timeout': 20000,
    'api': 'mtop.taobao.widgetService.getJsonComponent',
    'dataType': 'jsonp',
    'jsonpIncPrefix': 'mboxfc',
    'callback': 'mtopjsonpmboxfc3',
    'data': data,
}

# NOTE(review): the `_m_h5_tk` cookie below starts with 98db12d5..., while the
# sign was computed with token 39fa2858.... The token is presumably the part of
# `_m_h5_tk` before the underscore - confirm both match before sending.
headers = {
    'cookie': 'lid=luo736704198; alicnweb=touch_tb_at%3D1680934612278; _m_h5_tk=98db12d56bb5e26159c2c0e16743d0c0_1681220346823; _m_h5_tk_enc=fb7429c7b227adfe55c512741839e415; cookie1=BxSpHzb8GFsmSZHfbbB2adm%2FkInUN6kmS5LxVyBbwTk%3D; cookie2=166d0849c545842eeffd74dfaf147c9c; cookie17=UUGiHQt0GQ%2BU%2FQ%3D%3D; sgcookie=E10075e0oW6bDw5X8dlaQDOiIR0vgWI%2BpmsqdF%2FlAo9M6UlMoIqVMOxCAs7TF5EHvCnYNssyu8aagDM0vx7nDQHHjLvE801aXHKdHiP9xlB2Zck%3D; t=73833a3ac4c852d0f5656f3778fb2fff; _tb_token_=333efe5eee54b; sg=85d; csg=5e2d3c90; unb=2906023675; uc4=nk4=0%40DfV%2FByDhGKnRdwQiYyWpvjxAjEKwc6o%3D&id4=0%40U2OVFc7GR%2FBF8IuOJ%2FZPP0%2Fk2IT8; __cn_logon__=true; __cn_logon_id__=luo736704198; ali_apache_track=c_mid=b2b-2906023675ae282|c_lid=luo736704198|c_ms=1; ali_apache_tracktmp=c_w_signed=Y; _nk_=luo736704198; last_mid=b2b-2906023675ae282; _csrf_token=1681213129057; __mwb_logon_id__=luo736704198; mwb=ng; x5sec=7b226d746f703b32223a22656661303666666434383563396433376463376165353361303661353332303243494b5131614547454e663671746134745a587068774561444449354d4459774d6a4d324e7a55374d544362396657592f762f2f2f2f384251414d3d227d; cna=6QwQGjcvqQ0CAatxtqzgPaAk; l=fBMhOOe4NSPHmkzvXOfaFurza77OSIRYYuPzaNbMi9fP9k1B5yUcW1g7-A86C3GVFsdkR3o-P8F6BeYBq3xonxvtjsx8SPDmndLHR35..; isg=BAUFck8ezbE5senoi1gnyHIBFEE_wrlUWmfejgdqwTxLniUQzxLJJJN4qMJo3tEM',
    'referer': 'https://sale.1688.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}

# A timeout keeps the script from hanging forever if the server never answers.
res = requests.get(url, params=params, headers=headers, timeout=20)
# With type/dataType set to 'jsonp' the body is presumably JSON wrapped in the
# `mtopjsonpmboxfc3(...)` callback - strip the wrapper before parsing.
print(res.text)

image-20230412110803684

现在已经能够正常请求数据。

-------------本文结束感谢您的阅读-------------