|
1 | 1 | import requests
|
2 | 2 | import json
|
3 | 3 | import argparse
|
| 4 | +import csv |
4 | 5 | from argparse import RawTextHelpFormatter
|
5 | 6 | from datetime import datetime
|
6 | 7 |
|
7 |
| -# you may store your session cookie here persistently |
8 | 8 | LOGIN_COOKIE = "<INSERT-YOUR-XING-LOGIN-COOKIE-VALUE>"
|
9 | 9 |
|
10 |
| -# converting german umlauts |
11 | 10 | special_char_map = {ord('ä'):'ae', ord('ü'):'ue', ord('ö'):'oe', ord('ß'):'ss'}
|
12 | 11 |
|
13 | 12 | format_examples = '''
|
|
# --- Command-line interface -------------------------------------------------
# RawTextHelpFormatter keeps the line breaks of the multi-line
# `format_examples` text that is appended to the --email-format help.
parser = argparse.ArgumentParser("xingdumper.py", formatter_class=RawTextHelpFormatter)
parser.add_argument("--url", metavar='<xing-url>', help="A XING company url - https://xing.com/pages/<company>", type=str, required=True)
parser.add_argument("--count", metavar='<number>', help="Amount of employees to extract - max. 2999", type=int, required=False)
parser.add_argument("--cookie", metavar='<cookie>', help="XING 'login' cookie for authentication", type=str, required=False)
parser.add_argument("--full", help="Dump additional contact details (slow) - email, phone, fax, mobile", action='store_true')
parser.add_argument("--email-format", help="Python string format for emails; for example:" + format_examples, metavar='<mail-format>', type=str)
parser.add_argument("--output-json", help="Store results in json output file", metavar="<json-file>", type=str, required=False)
parser.add_argument("--output-csv", help="Store results in csv output file", metavar="<csv-file>", type=str, required=False)

args = parser.parse_args()
url = args.url

# A cookie given on the command line overrides the value stored in the script.
if args.cookie:
    LOGIN_COOKIE = args.cookie

# `mailformat` is either the user's format string or False (feature disabled).
mailformat = args.email_format if args.email_format else False

# Only values in 1..2999 are forwarded. A missing, zero, negative or too-large
# --count falls back to 2999, the maximum advertised in the --count help text.
count = args.count if args.count and 0 < args.count < 3000 else 2999

# --- Request configuration --------------------------------------------------
api = "https://www.xing.com/xing-one/api"
headers = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 'Content-type': 'application/json'}
cookies_dict = {"login": LOGIN_COOKIE}
|
49 | 41 |
|
50 |
| -if (url.startswith('https://www.xing.com/pages/')): |
51 |
| - try: |
52 |
| - before_keyword, keyword, after_keyword = url.partition('pages/') |
53 |
| - company = after_keyword |
54 |
| - |
55 |
| - # retrieve company id from the api |
56 |
| - postdata1 = {"operationName":"EntitySubpage","variables":{"id":company,"moduleType":"employees"},"query":"query EntitySubpage($id: SlugOrID!, ) {\n entityPageEX(id: $id) {\n ... on EntityPage {\n slug\n title\n context {\n companyId\n }\n }\n }\n}\n"} |
57 |
| - r = requests.post(api, data=json.dumps(postdata1), headers=headers, cookies=cookies_dict, timeout=200) |
58 |
| - response1 = r.json() |
59 |
| - |
60 |
| - companyID = response1["data"]["entityPageEX"]["context"]["companyId"] |
61 |
| - |
62 |
| - # retrieve employee information from the api based on previously obtained company id |
63 |
| - postdata2 = {"operationName":"Employees","variables":{"consumer":"","id":companyID,"first":count,"query":{"consumer":"web.entity_pages.employees_subpage","sort":"CONNECTION_DEGREE"}},"query":"query Employees($id: SlugOrID!, $first: Int, $after: String, $query: CompanyEmployeesQueryInput!, $consumer: String! = \"\", $includeTotalQuery: Boolean = false) {\n company(id: $id) {\n id\n totalEmployees: employees(first: 0, query: {consumer: $consumer}) @include(if: $includeTotalQuery) {\n total\n }\n employees(first: $first, after: $after, query: $query) {\n total\n edges {\n node {\n profileDetails {\n id\n firstName\n lastName\n displayName\n gender\n pageName\n location {\n displayLocation\n }\n occupations {\n subline\n }\n }\n }\n }\n }\n }\n}\n"} |
64 |
| - r2 = requests.post(api, data=json.dumps(postdata2), headers=headers, cookies=cookies_dict, timeout=200) |
65 |
| - response2 = r2.json() |
66 |
| - |
67 |
| - if not args.quiet: |
68 |
| - |
69 |
| - print("""\ |
70 |
| -
|
71 |
| -▒██ ██▒ ██▓ ███▄ █ ▄████ ▓█████▄ █ ██ ███▄ ▄███▓ ██▓███ ▓█████ ██▀███ |
72 |
| -▒▒ █ █ ▒░▓██▒ ██ ▀█ █ ██▒ ▀█▒▒██▀ ██▌ ██ ▓██▒▓██▒▀█▀ ██▒▓██░ ██▒▓█ ▀ ▓██ ▒ ██▒ |
73 |
| -░░ █ ░▒██▒▓██ ▀█ ██▒▒██░▄▄▄░░██ █▌▓██ ▒██░▓██ ▓██░▓██░ ██▓▒▒███ ▓██ ░▄█ ▒ |
74 |
| - ░ █ █ ▒ ░██░▓██▒ ▐▌██▒░▓█ ██▓░▓█▄ ▌▓▓█ ░██░▒██ ▒██ ▒██▄█▓▒ ▒▒▓█ ▄ ▒██▀▀█▄ |
75 |
| -▒██▒ ▒██▒░██░▒██░ ▓██░░▒▓███▀▒░▒████▓ ▒▒█████▓ ▒██▒ ░██▒▒██▒ ░ ░░▒████▒░██▓ ▒██▒ |
76 |
| -▒▒ ░ ░▓ ░░▓ ░ ▒░ ▒ ▒ ░▒ ▒ ▒▒▓ ▒ ░▒▓▒ ▒ ▒ ░ ▒░ ░ ░▒▓▒░ ░ ░░░ ▒░ ░░ ▒▓ ░▒▓░ |
77 |
| -░░ ░▒ ░ ▒ ░░ ░░ ░ ▒░ ░ ░ ░ ▒ ▒ ░░▒░ ░ ░ ░ ░ ░░▒ ░ ░ ░ ░ ░▒ ░ ▒░ |
78 |
| - ░ ░ ▒ ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ ░░░ ░ ░ ░ ░ ░░ ░ ░░ ░ |
79 |
| - ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ by LRVT |
80 |
| - """) |
81 |
| - |
82 |
| - print("[i] Company Name: " + response1["data"]["entityPageEX"]["title"]) |
83 |
| - print("[i] Company X-ID: " + companyID) |
84 |
| - print("[i] Company Slug: " + company) |
85 |
| - print("[i] Dumping Date: " + datetime.now().strftime("%d/%m/%Y %H:%M:%S")) |
86 |
| - if mailformat: |
87 |
| - print("[i] Email Format: " + mailformat) |
88 |
| - print() |
89 |
| - |
90 |
| - if not mailformat: |
91 |
| - if args.full: |
92 |
| - legende = "Firstname;Lastname;Position;Gender;Location;E-Mail;Fax;Mobile;Phone;Profile" |
93 |
| - else: |
94 |
| - legende = "Firstname;Lastname;Position;Gender;Location;Profile" |
95 |
| - else: |
96 |
| - if args.full: |
97 |
| - legende = "Firstname;Lastname;Email;Position;Gender;Location;E-Mail;Fax;Mobile;Phone;Profile" |
98 |
| - else: |
99 |
| - legende = "Firstname;Lastname;Email;Position;Gender;Location;Profile" |
100 |
| - |
101 |
| - print(legende) |
102 |
| - |
103 |
| - dump_count = 0 |
104 |
| - |
105 |
| - # loop over employees |
106 |
| - for employee in response2['data']['company']['employees']['edges']: |
107 |
| - dump_count += 1 |
108 |
| - firstname = employee['node']['profileDetails']['firstName'] |
109 |
| - lastname = employee['node']['profileDetails']['lastName'] |
110 |
| - try: |
111 |
| - position = employee['node']['profileDetails']['occupations'][0]['subline'] |
112 |
| - except: |
113 |
| - position = "None" |
114 |
| - gender = employee['node']['profileDetails']['gender'] |
115 |
| - location = employee['node']['profileDetails']['location']['displayLocation'].replace('**','').replace(', ',',') |
116 |
| - pagename = employee['node']['profileDetails']['pageName'] |
117 |
| - |
118 |
| - if args.full: |
119 |
| - # dump additional contact details for each employee. Most often is "None", so no default api queries for this data |
120 |
| - postdata3 = {"operationName":"getXingId","variables":{"profileId":pagename},"query":"query getXingId($profileId: SlugOrID!, $actionsFilter: [AvailableAction!]) {\n profileModules(id: $profileId) {\n __typename\n xingIdModule(actionsFilter: $actionsFilter) {\n xingId {\n status {\n localizationValue\n __typename\n }\n __typename\n }\n __typename\n ...xingIdContactDetails\n }\n }\n}\n\nfragment xingIdContactDetails on XingIdModule {\n contactDetails {\n business {\n email\n fax {\n phoneNumber\n }\n mobile {\n phoneNumber\n }\n phone {\n phoneNumber\n }\n }\n __typename\n }\n __typename\n}\n"} |
121 |
| - r3 = requests.post(api, data=json.dumps(postdata3), headers=headers, cookies=cookies_dict, timeout=200) |
122 |
| - response3 = r3.json() |
123 |
| - try: |
124 |
| - # try to extract contact details |
125 |
| - email = response3['data']['profileModules']['xingIdModule']['contactDetails']['business']['email'] |
126 |
| - fax = response3['data']['profileModules']['xingIdModule']['contactDetails']['business']['fax']['phoneNumber'] |
127 |
| - mobile = response3['data']['profileModules']['xingIdModule']['contactDetails']['business']['mobile']['phoneNumber'] |
128 |
| - phone = response3['data']['profileModules']['xingIdModule']['contactDetails']['business']['phone']['phoneNumber'] |
129 |
| - except: |
130 |
| - # if contact details are missing in the API response, set to 'None' |
131 |
| - email = "None" |
132 |
| - fax = "None" |
133 |
| - mobile = "None" |
134 |
| - phone = "None" |
135 |
| - |
136 |
| - if not mailformat: |
137 |
| - # print employee information as Comma Separated Values (CSV) |
138 |
| - print(firstname + ";" + lastname + ";" + position + ";" + gender + ";" + location + ";" + str(email) + ";" + str(fax) + ";" + str(mobile) + ";" + str(phone) + ";" + "https://www.xing.com/profile/" + pagename) |
139 |
| - else: |
140 |
| - print(firstname + ";" + lastname + ";" + mailformat.format(firstname.lower().replace(".","").translate(special_char_map),lastname.lower().replace(".","").translate(special_char_map)) + ";" + position + ";" + gender + ";" + location + ";" + str(email) + ";" + str(fax) + ";" + str(mobile) + ";" + str(phone) + ";" + "https://www.xing.com/profile/" + pagename) |
141 |
| - else: |
142 |
| - if not mailformat: |
143 |
| - print(firstname + ";" + lastname + ";" + position + ";" + gender + ";" + location + ";" + "https://www.xing.com/profile/" + pagename) |
144 |
| - else: |
145 |
| - print(firstname + ";" + lastname + ";" + mailformat.format(firstname.lower().replace(".","").translate(special_char_map),lastname.lower().replace(".","").translate(special_char_map)) + ";" + position + ";" + gender + ";" + location + ";" + "https://www.xing.com/profile/" + pagename) |
146 |
| - |
147 |
| - if not args.quiet: |
148 |
| - print() |
149 |
| - print("[i] Successfully crawled " + str(dump_count) + " " + response1["data"]["entityPageEX"]["title"] + " employees. Hurray ^_-") |
150 |
| - |
151 |
| - except Exception as e: |
152 |
| - print() |
153 |
| - print("[!] Exception. Either API has changed and this script is broken or authentication failed.") |
154 |
| - print(" > Set 'LOGIN_COOKIE' variable permanently in script or use the '--cookie' CLI flag!") |
155 |
| - print("[debug] " + str(e)) |
# GraphQL documents posted to the XING API endpoint.
_COMPANY_QUERY = "query EntitySubpage($id: SlugOrID!, ) { entityPageEX(id: $id) { ... on EntityPage { slug title context { companyId } } } }"
_EMPLOYEES_QUERY = "query Employees($id: SlugOrID!, $first: Int, $after: String, $query: CompanyEmployeesQueryInput!, $consumer: String! = \"\", $includeTotalQuery: Boolean = false) { company(id: $id) { id totalEmployees: employees(first: 0, query: {consumer: $consumer}) @include(if: $includeTotalQuery) { total } employees(first: $first, after: $after, query: $query) { total edges { node { profileDetails { id firstName lastName displayName gender pageName location { displayLocation } occupations { subline } } } } } } }"
_CONTACT_QUERY = "query getXingId($profileId: SlugOrID!, $actionsFilter: [AvailableAction!]) { profileModules(id: $profileId) { __typename xingIdModule(actionsFilter: $actionsFilter) { xingId { status { localizationValue __typename } __typename } __typename ...xingIdContactDetails } } } fragment xingIdContactDetails on XingIdModule { contactDetails { business { email fax { phoneNumber } mobile { phoneNumber } phone { phoneNumber } } __typename } __typename }"


def _text(value, default="None"):
    """Return *value* as a string, or *default* when the value is None.

    ``dict.get(key, default)`` only applies its default when the key is
    absent; a key that is present with a null value still yields None.
    Normalising here keeps ``";".join(...)`` from failing on such values.
    """
    return default if value is None else str(value)


def _post_query(payload):
    """POST *payload* as JSON to the API and return the decoded JSON reply."""
    reply = requests.post(api, data=json.dumps(payload), headers=headers, cookies=cookies_dict, timeout=200)
    return reply.json()


def _mail_token(name):
    """Lower-case *name*, drop dots and transliterate via `special_char_map`."""
    return name.lower().replace(".", "").translate(special_char_map)


def _contact_details(pagename):
    """Fetch the business contact details of the profile *pagename*.

    Returns a dict with the keys ``business_email``, ``fax``, ``mobile`` and
    ``phone``. Each field falls back to the string 'None' on its own, so one
    missing sub-object no longer discards the fields that are present.
    """
    reply = _post_query({"operationName": "getXingId", "variables": {"profileId": pagename}, "query": _CONTACT_QUERY})
    try:
        business = reply['data']['profileModules']['xingIdModule']['contactDetails']['business']
    except (KeyError, TypeError):
        business = None
    if not isinstance(business, dict):
        business = {}

    details = {"business_email": _text(business.get("email"))}
    for kind in ("fax", "mobile", "phone"):
        entry = business.get(kind)
        number = entry.get("phoneNumber") if isinstance(entry, dict) else None
        details[kind] = _text(number)
    return details


def _header_row(with_email, with_contact):
    """Return the column titles matching the layout of `_employee_row`."""
    titles = ["Firstname", "Lastname"]
    if with_email:
        titles.append("Email")
    titles += ["Position", "Gender", "Location"]
    if with_contact:
        titles += ["E-Mail", "Fax", "Mobile", "Phone"]
    titles.append("Profile")
    return titles


def _employee_row(emp, with_email, with_contact):
    """Return the values of one employee entry in output column order."""
    row = [emp['firstname'], emp['lastname']]
    if with_email:
        row.append(emp['email'])
    row += [emp['position'], emp['gender'], emp['location']]
    if with_contact:
        row += [emp['business_email'], emp['fax'], emp['mobile'], emp['phone']]
    row.append(emp['profile'])
    return row


if url.startswith('https://www.xing.com/pages/'):
    try:
        _, _, company = url.partition('pages/')

        # Resolve the company slug to its id and display title.
        response1 = _post_query({"operationName": "EntitySubpage", "variables": {"id": company, "moduleType": "employees"}, "query": _COMPANY_QUERY})
        companyID = response1["data"]["entityPageEX"]["context"]["companyId"]
        companyTitle = response1["data"]["entityPageEX"]["title"]

        # Fetch up to `count` employees of that company.
        response2 = _post_query({"operationName": "Employees", "variables": {"consumer": "", "id": companyID, "first": count, "query": {"consumer": "web.entity_pages.employees_subpage", "sort": "CONNECTION_DEGREE"}}, "query": _EMPLOYEES_QUERY})

        # Results go to the console only when no output file was requested.
        console_output = not args.output_json and not args.output_csv
        with_email = bool(mailformat)

        if console_output:
            print()
            print(f"[i] Company Name: {companyTitle}")
            print(f"[i] Company X-ID: {companyID}")
            print(f"[i] Company Slug: {company}")
            print("[i] Dumping Date: " + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
            if mailformat:
                print(f"[i] Email Format: {mailformat}")
            print()
        else:
            print()

        employees = []
        for edge in response2['data']['company']['employees']['edges']:
            profile = edge['node']['profileDetails']
            firstname = _text(profile.get('firstName'), '')
            lastname = _text(profile.get('lastName'), '')
            pagename = _text(profile.get('pageName'), '')

            # `location` may be absent or null; treat both as an empty string.
            location_info = profile.get('location')
            raw_location = location_info.get('displayLocation') if isinstance(location_info, dict) else None
            location = _text(raw_location, '').replace('**', '').replace(', ', ',')

            try:
                position = _text(profile['occupations'][0]['subline'])
            except (KeyError, IndexError, TypeError):
                position = "None"

            employee_entry = {
                "firstname": firstname,
                "lastname": lastname,
                "position": position,
                "gender": _text(profile.get('gender'), 'N/A'),
                "location": location,
                "profile": f"https://www.xing.com/profile/{pagename}",
            }

            if mailformat:
                employee_entry['email'] = mailformat.format(_mail_token(firstname), _mail_token(lastname))

            if args.full:
                # One extra API request per employee, hence "slow" in --help.
                employee_entry.update(_contact_details(pagename))

            employees.append(employee_entry)

        if console_output:
            print(";".join(_header_row(with_email, args.full)))
            for emp in employees:
                print(";".join(_employee_row(emp, with_email, args.full)))
            print()

        if args.output_json:
            try:
                output = {
                    "company_id": companyID,
                    "company_url": url,
                    "company_slug": company,
                    "timestamp": datetime.now().isoformat(),
                    "employees": employees,
                }
                with open(args.output_json, 'w', encoding='utf-8') as f:
                    json.dump(output, f, ensure_ascii=False, indent=4)
                print(f"[i] Results written to JSON: {args.output_json}")
            except Exception as e:
                # Best effort: report the failure and still try the CSV export.
                print(f"[!] Error writing JSON: {e}")

        if args.output_csv:
            try:
                with open(args.output_csv, 'w', newline='', encoding='utf-8') as f:
                    writer = csv.writer(f, delimiter=';')
                    # Not named `headers`: that would rebind the module-level
                    # HTTP headers dict used for the API requests.
                    writer.writerow(_header_row(with_email, args.full))
                    writer.writerows(_employee_row(emp, with_email, args.full) for emp in employees)
                print(f"[i] Results written to CSV: {args.output_csv}")
            except Exception as e:
                print(f"[!] Error writing CSV: {e}")

        print(f"[i] Successfully crawled {len(employees)} {companyTitle} employees. Hurray ^_-")

    except Exception as e:
        # Top-level boundary: any API or parsing failure ends up here.
        print("\n[!] Exception. Either API has changed and this script is broken or authentication failed.")
        print("    > Set 'LOGIN_COOKIE' variable permanently in script or use the '--cookie' CLI flag!")
        print(f"[debug] {e}")
else:
    print("\n[!] Invalid URL provided.")
    print("[i] Example URL: 'https://www.xing.com/pages/appleretaildeutschlandgmbh'")
0 commit comments