项目介绍
用于爬取豆瓣正在热映电影的数据,包括电影演职人员信息、剧情简介、电影影评等。
技术选型
使用 Selenium 节点、函数节点、变量节点、循环节点以及自定义 JS 函数
流程源码
注意:Selenium 节点中需要配置远程驱动地址(remote-webdriver-url),下方示例使用 http://127.0.0.1:9515,请根据实际环境修改
json
{
"nodeList": [
{
"nodeId": "rra9tddz81nf3fz2qvpg",
"left": "73px",
"top": "83px",
"class": "workflow-center-clone",
"name": "start",
"label": "开始",
"form": [
{
"labelName": "最大线程数",
"componentType": "EL_NUMBER_INPUT",
"dataType": "INT",
"propName": "threadCount",
"placeholder": "请输入爬虫最大线程数",
"value": 4,
"attributes": {
"min": 4,
"max": 12
},
"childrenItem": null,
"required": false
}
],
"icon": "ele-Flag",
"type": "node"
},
{
"nodeId": "6j08gpwqrwh8z0spuama",
"left": "248px",
"top": "23px",
"class": "workflow-center-clone",
"name": "selenium",
"label": "Selenium",
"form": [
{
"labelName": "节点变量",
"componentType": "EL_INPUT",
"dataType": "STRING",
"propName": "nodeVariableName",
"placeholder": "请输入节点变量",
"value": "resp",
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "Cookie自动管理",
"componentType": "EL_SWITCH",
"dataType": "BOOLEAN",
"propName": "cookie-auto-set",
"placeholder": null,
"value": true,
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "启动参数",
"componentType": "CUSTOM_MULT_KEY_VALUE",
"dataType": "LIST_MAP",
"propName": "extConfig",
"placeholder": "请选择启动参数",
"value": [],
"attributes": null,
"childrenItem": [
{
"labelName": "最大窗口",
"value": "--start-maximized",
"dataType": "STRING"
},
{
"labelName": "无头模式",
"value": "headless",
"dataType": "STRING"
}
],
"required": false
},
{
"labelName": "循环变量",
"componentType": "EL_INPUT",
"dataType": "STRING",
"propName": "loopVariableName",
"placeholder": "请输入循环变量",
"value": "",
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "循环次数",
"componentType": "EL_NUMBER_INPUT",
"dataType": "INT",
"propName": "loopCount",
"placeholder": "请输入循环次数",
"value": 1,
"attributes": {
"min": 1
},
"childrenItem": null,
"required": false
},
{
"labelName": "页面加载超时时间(ms)",
"componentType": "EL_INPUT",
"dataType": "INT",
"propName": "pageLoadTimeout",
"placeholder": "请输入页面加载超时时间",
"value": 100000,
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "元素获取超时时间",
"componentType": "EL_INPUT",
"dataType": "INT",
"propName": "implicitlyWaitTimeout",
"placeholder": "请输入元素获取超时时间",
"value": 5000,
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "驱动类型",
"componentType": "EL_SELECT",
"dataType": "STRING",
"propName": "driverType",
"placeholder": "请选择驱动类型",
"value": "chrome",
"attributes": null,
"childrenItem": [
{
"labelName": "chrome",
"value": "chrome",
"dataType": "STRING"
},
{
"labelName": "firefox",
"value": "firefox",
"dataType": "STRING"
}
],
"required": false
},
{
"labelName": "请求地址",
"componentType": "EL_INPUT",
"dataType": "STRING",
"propName": "url",
"placeholder": "请输入请求地址(url)",
"value": "https://movie.douban.com/",
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "携带session请求网址",
"componentType": "CUSTOM_MULT_KEY_VALUE",
"dataType": "LIST_MAP",
"propName": "header-session",
"placeholder": "请输入携带的session",
"value": [],
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "远程驱动地址",
"componentType": "EL_INPUT",
"dataType": "STRING",
"propName": "remote-webdriver-url",
"placeholder": "请输入远程驱动地址",
"value": "http://127.0.0.1:9515",
"attributes": null,
"childrenItem": null,
"required": false
}
],
"icon": "ele-ChromeFilled",
"type": "node"
},
{
"nodeId": "c9fjlmbl6p3anebwk2ut",
"left": "469px",
"top": "25px",
"class": "workflow-center-clone",
"name": "function",
"label": "函数",
"form": [
{
"labelName": "函数",
"componentType": "CUSTOM_MULT_VALUE",
"dataType": "LIST_MAP",
"propName": "function",
"placeholder": null,
"value": [
{
"remark": "点击进入正在热映页面",
"value": "${resp.elementToBeClickable('/html/body/div[3]/div[1]/div/div[2]/div[1]/div[1]/h2/span[1]/a').click()}",
"key": ""
},
{
"remark": "获取正在上映电影节点",
"value": "${resp.xpaths('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/ul/li')}"
}
],
"attributes": null,
"childrenItem": null,
"required": false
}
],
"icon": "iconfont icon-terminal",
"type": "node"
},
{
"nodeId": "q7ctqdorw3tsaktr1gqb",
"left": "649px",
"top": "75px",
"class": "workflow-center-clone",
"name": "variable",
"label": "变量",
"form": [
{
"labelName": "变量列表",
"componentType": "CUSTOM_MULT_KEY_VALUE",
"dataType": "LIST_MAP",
"propName": "variable",
"placeholder": null,
"value": [
{
"remark": "电影详情页",
"value": "https://movie.douban.com/subject/",
"key": "movieUrl"
},
{
"remark": "获取element元素",
"value": "${resp.xpaths('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/ul/li')}",
"key": "elementLi"
}
],
"attributes": null,
"childrenItem": null,
"required": false
}
],
"icon": "ele-Share",
"type": "node"
},
{
"nodeId": "awaatp13q6a03oiblvfz",
"left": "565px",
"top": "269px",
"class": "workflow-center-clone",
"name": "loop",
"label": "循环",
"form": [
{
"labelName": "开始下标",
"componentType": "EL_INPUT",
"dataType": "INT",
"propName": "loopStartIndex",
"placeholder": "请输入开始下标",
"value": 0,
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "结束下标",
"componentType": "EL_INPUT",
"dataType": "INT",
"propName": "loopEndIndex",
"placeholder": "请输入结束下标",
"value": 61,
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "循环次数",
"componentType": "EL_INPUT",
"dataType": "STRING",
"propName": "loopCount",
"placeholder": "请输入循环次数(数字或表达式)",
"value": "${elementLi.size()}",
"attributes": null,
"childrenItem": null,
"required": false
},
{
"labelName": "循环下标",
"componentType": "EL_INPUT",
"dataType": "STRING",
"propName": "loopIndex",
"placeholder": "请输入循环下标名称",
"value": "loopIndex",
"attributes": null,
"childrenItem": null,
"required": false
}
],
"icon": "ele-Refresh",
"type": "node"
},
{
"nodeId": "i2kxpmoyyhaohjmrryci",
"left": "278px",
"top": "283px",
"class": "workflow-center-clone",
"name": "variable",
"label": "变量",
"form": [
{
"labelName": "变量列表",
"componentType": "CUSTOM_MULT_KEY_VALUE",
"dataType": "LIST_MAP",
"propName": "variable",
"placeholder": null,
"value": [
{
"remark": "获取当前li标签Id",
"value": "${resp.xpaths('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/ul/li')[loopIndex].getAttribute('id')}",
"key": "getLi"
},
{
"remark": "拼接详情页网址",
"value": "https://movie.douban.com/subject/${getLi}",
"key": "movieUrl"
},
{
"remark": "跳转电影详情",
"value": "${resp.toUrl(movieUrl)}",
"key": "goMovieUrl"
},
{
"remark": "获取电影详情",
"value": "${resp.xpaths('/html/body/div[3]/div[1]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span').text()}",
"key": "movieInfo"
},
{
"remark": "电影简介",
"value": "${resp.xpath('//*[@id=\"link-report-intra\"]/span').text()}",
"key": "briefIntroduction"
},
{
"remark": "电影影评",
"value": "${resp.xpaths('/html/body/div[3]/div[1]/div/div[1]/section/div[2]/div')}",
"key": "comments"
},
{
"remark": "调用自定义函数 douban 提取影评内容",
"value": "${douban(comments,resp)}",
"key": "comments"
},
{
"value": "${resp.toUrl(\"https://movie.douban.com/cinema/nowplaying\")}",
"key": "back",
"remark": "后退"
}
],
"attributes": null,
"childrenItem": null,
"required": false
}
],
"icon": "ele-Share",
"type": "node"
}
],
"lineList": [
{
"sourceId": "c9fjlmbl6p3anebwk2ut",
"targetId": "q7ctqdorw3tsaktr1gqb",
"label": "",
"exceptionFlow": 2,
"condition": "",
"transmitVariable": true
},
{
"sourceId": "awaatp13q6a03oiblvfz",
"targetId": "i2kxpmoyyhaohjmrryci",
"label": "",
"exceptionFlow": 2,
"condition": "",
"transmitVariable": true,
"type": "line"
},
{
"sourceId": "rra9tddz81nf3fz2qvpg",
"targetId": "6j08gpwqrwh8z0spuama",
"label": "",
"exceptionFlow": 2,
"condition": "",
"transmitVariable": true
},
{
"sourceId": "6j08gpwqrwh8z0spuama",
"targetId": "c9fjlmbl6p3anebwk2ut",
"label": "",
"exceptionFlow": 2,
"condition": "",
"transmitVariable": true
},
{
"sourceId": "q7ctqdorw3tsaktr1gqb",
"targetId": "awaatp13q6a03oiblvfz",
"label": "",
"exceptionFlow": 2,
"condition": "",
"transmitVariable": true
}
]
}
自定义函数
注意:上述流程源码通过 ${douban(comments,resp)} 调用自定义函数,运行前需要先定义该函数。函数名称:douban,函数参数:elements, resp
JavaScript
var ArrayList = Java.type('java.util.ArrayList');
var By = Java.type('org.openqa.selenium.By');
var arrayList = new ArrayList();
for(i = 0; i <= elements.size() -1; i++) {
var headElement = resp.getDriver().findElements(By.className("main-hd"));
var contextElement = resp.getDriver().findElements(By.className("main-bd"));
for (j = 0; j <= contextElement.size() -1; j++) {
var head = headElement.get(j).getText();
var context = contextElement.get(j).getText();
var innerHTML = head + "//" + context;
print("innerHTML: " + innerHTML);
arrayList.add(innerHTML);
}
}
return arrayList;