想把我最近做的一个分布式网络爬虫系统开源,不知道有没有人感兴趣

50 天前
 guoguobaba

WebRPA: 分布式网络爬虫

前言

WebRPA 是一个分布式的网络爬虫系统,基于 fastapi + fastadmin 开发,通过 Web API 接口发起网络爬虫服务,实现流程自动化或数据自动抓取。它包含两部分,整体调用关系如下图所示(client 调用 manager,manager 再连接各个 worker):

```mermaid
graph LR
client-->manager-->worker1
manager-->worker2
manager-->workers[worker...]
```

主要实现的功能包括:

TODO

引入 browser-use,通过 LLM 自动创建数据爬虫服务。

一个示例,爬取某网站

{
  "name": "szreorc",
  "desc": "深圳不动产查询",
  "driver": "firefox",
  "url": "",
  "debug": true,
  "window_size": "1920x1080",
  "action_timeout": 5,
  "wait_redirect": true,
  "wait_redirect_interval": 2,
  "identifier": "{username}-{BuildingName}-{UNIT_NO}",
  "credential": "{username}",
  "actions": {
    "1": {
      "desc": "确认登录",
      "action": "check_variable",
      "options": {"script": "return window.location.href;",
        "target": "^https://pnr.sz.gov.cn/d-ghrer/reroosp/ytcf"
      }
    },
    "10": {
      "desc" : "用户名密码登录",
      "action": "click",
      "timeout": 2,
      "target": ["xpath", "//a[contains(@class, 'login-tab') and normalize-space(text())='账号密码']"]
    },
    "11": {
      "desc" : "输入用户名",
      "action": "input_text",
      "target": ["xpath", "//input[@type='text' and @placeholder='请输入账号']"],
      "param": "username"
    },
    "12": {
      "desc": "增加计数",
      "action": "variable",
      "options": {"variable":"counter1","operator": "+"}
    },
    "13": {
      "desc": "检测计数",
      "action": "variable",
      "stop_on_fail": true,
      "options": {"variable":"counter1","operator": "<", "target": 2, "sleep": 2000}
    },
    "14": {
      "desc" : "输入密码",
      "action": "input_text",
      "target": ["xpath", "//input[@type='password' and @placeholder='请输入密码']"],
      "param": "password"
    },
    "15": {
      "desc": "识别 captcha",
      "action": "decode_captcha_code",
      "target": ["xpath","//div[contains(@class, 'captcha-body') and @title='点击刷新']"],
      "options": {"code_type": 11}
    },
    "16": {
      "desc": "输入 captcha",
      "action": "input_text",
      "target": ["xpath","//div[contains(@class, 'account_verifying')] //input[@type='text']"]
    },
    "17": {
      "desc": "点击登录",
      "action": "click",
      "target": ["xpath", "//button[contains(@class, 'gd-btn-primary') and contains(@class, 'gd-btn') and @type='button']//span[starts-with(text(), '登录 ')]"]
    },
    "18": {
      "desc": "继续登录",
      "action": "click",
      "target": ["xpath", "//button[.//span[contains(text(), '继续登录')]]"]
    },
    "20": {
      "desc": "确认选择",
      "action": "click",
      "timeout": 10,
      "stop_on_fail": true,
      "fail_message": "login failed",
      "options": {"set_credential": true},
      "target": ["class name", "jinruxuzhi-checkbox"]
    },
    "21": {
      "desc": "确认选择下一步",
      "action": "click",
      "target": ["class name", "jinruxuzhi-buttonOk"]
    },
    "30": {
      "desc": "展开查询类型",
      "action": "click",
      "options": {"sleep": 2},
      "target": ["xpath", "//input[@type='text' and @placeholder='请选择']"]
    },
    "31": {
      "desc": "等待下拉菜单",
      "action": "wait_element",
      "options": {"visible":  true},
      "target": ["css selector", "div.el-select-dropdown.el-popper"]
    },
    "32": {
      "desc": "选择查询类型",
      "action": "click",
      "target": ["xpath", "//li[contains(@class, 'el-select-dropdown__item') and span[text()='楼名及栋名']]"]
    },
    "33": {
      "desc" : "输入查询内容",
      "action": "input_text",
      "target": ["xpath", "//input[@type='text' and @placeholder='请输入内容']"],
      "param": "BuildingName"
    },
    "34": {
      "desc": "点击查询",
      "action": "click",
      "target": ["class name", "el-icon-search"]
    },
    "35": {
      "desc": "点击截图对象",
      "action": "click",
      "timeout": 20,
      "stop_on_fail": true,
      "fail_message":  "search failed",
      "target": ["xpath", "//div[contains(@class, 'el-dialog__wrapper')]//div[contains(@class, 'el-tabs__item') and normalize-space(text())='楼宇']"]
    },

    "40": {
      "desc": "获取数据",
      "action": "get_data",
      "options": {"script": "var table = document.querySelector(\"#pane-1 table.is-bordered.el-descriptions--mini\");\nvar fields = [\"土地坐落\", \"楼名及栋名\", \"房屋类型\", \"房屋性质\", \"房屋用途\"];\nvar result = {};\nif (table) {\n    var rows = table.querySelectorAll(\"tr.el-descriptions-row\");\n    rows.forEach(function(row) {\n        var label = row.querySelector(\"th.el-descriptions-item__label\").innerText.trim();\n        var content = row.querySelector(\"td.el-descriptions-item__content\").innerText.trim();\n        if (fields.includes(label)) {\n            result[label] = content;\n        }\n    });\n    console.log(JSON.stringify(result));\n} else {\n    console.log(\"Table not found.\");\n};\nreturn result;\n"}
    },
    "41": {
      "desc": "点击截图对象",
      "action": "click",
      "target": ["xpath", "//div[contains(@class, 'el-dialog__wrapper')]//div[contains(@class, 'el-tabs__item') and normalize-space(text())='房屋']"]
    },
    "42": {
      "desc": "下拉房屋查询",
      "action": "click",
      "target": ["css selector", "#pane-2 input.el-input__inner"]
    },
    "43": {
      "desc": "点击房屋查询",
      "action": "click",
      "target": ["xpath", "//li[contains(@class, 'el-select-dropdown__item')]//span[text()='{UNIT_NO}']"],
      "param": "UNIT_NO"
    },
    "44": {
      "desc": "截图",
      "action": "screenshot",
      "target": ["class name", "el-dialog__wrapper"],
      "options": {"visible": true}
    },
    "45": {
      "desc": "获取数据",
      "action": "get_data",
      "options": {"script": "var table = document.querySelector(\"#pane-2 table.is-bordered.el-descriptions--mini\");\nvar fields = [\"房号\", \"所在楼层\", \"建筑面积\", \"使用年限\", \"存在抵押\", \"存在查封\", \"存在异议\", \"存在居住权\"];\nvar result = {};\nif (table) {\n    var rows = table.querySelectorAll(\"tr.el-descriptions-row\");\n    rows.forEach(function(row) {\n        var label = row.querySelector(\"th.el-descriptions-item__label\").innerText.trim();\n        var content = row.querySelector(\"td.el-descriptions-item__content\").innerText.trim();\n        if (fields.includes(label)) {\n            result[label] = content;\n        }\n    });\n    console.log(JSON.stringify(result));\n} else {\n    console.log(\"Table not found.\");\n};\nreturn result;\n"}
    }
  },
  "processes": "start->1\n1(no)->10->11\n11(no)->12->13\n13(yes)->10\n11(yes)->14->15->16->17->18->20->21->30->31->32->33->34->35->40->41->42->43->44->45->end\n1(yes)->20",
  "result":["screenshot", "data"]
}

2093 次点击
所在节点    程序员
5 条回复
hamwong
50 天前
滑动等更复杂的人机校验是怎么处理的🙋
5ssl
49 天前
能实现人机交互吗,厉害
Hermitist
49 天前
不知道 pinxixi 是否可以
Mast
48 天前
插眼,等开源试试看
bwnjnOEI
47 天前
能过京东淘宝突然弹出来的人机检测吗

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/1171254

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX