Skip to content

Commit 8c821bd

Browse files
committed
feat: add output options
1 parent 25679e8 commit 8c821bd

File tree

3 files changed

+154
-128
lines changed

3 files changed

+154
-128
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Ignore all JSON and CSV files
2+
*.json
3+
*.csv
4+
5+
# Optional: ignore specific output directories if you organize results
6+
/output/
7+
results/

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,16 @@ options:
4444
--count <number> Amount of employees to extract - max. 2999
4545
--cookie <cookie> XING 'login' cookie for authentication
4646
--full Dump additional contact details (slow) - email, phone, fax, mobile
47-
--quiet Show employee results only
4847
--email-format Python string format for emails; for example:
4948
--email-format '{0}.{1}@example.com' --> john.doe@example.com
5049
--email-format '{0[0]}.{1}@example.com' --> j.doe@example.com
5150
--email-format '{1}@example.com' --> doe@example.com
5251
--email-format '{0}@example.com' --> john@example.com
5352
--email-format '{0[0]}{1[0]}@example.com' --> jd@example.com
53+
--output-json <json-file>
54+
Store results in json output file
55+
--output-csv <csv-file>
56+
Store results in csv output file
5457
````
5558

5659
### 🐳 Example 1 - Docker Run
@@ -97,6 +100,8 @@ Isma;Abdan;isma.abdan@apple.de;Gabelstaplerfahrer;MALE;Huelva,Spanien;None;None;
97100
[i] Successfully crawled 2 Apple employees. Hurray ^_-
98101
````
99102

103+
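Optionally, use the CLI parameters `--output-json` and `--output-csv` to store the results as JSON or CSV. When either option is set, the employee list is written to the given file(s) instead of being printed to the console.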
104+
100105
## 💥 Limitations
101106

102107
Dumped contact details via `--full` are most often empty. Germans seem to take privacy very seriously.

xingdumper.py

Lines changed: 141 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
import requests
22
import json
33
import argparse
4+
import csv
45
from argparse import RawTextHelpFormatter
56
from datetime import datetime
67

7-
# you may store your session cookie here persistently
88
LOGIN_COOKIE = "<INSERT-YOUR-XING-LOGIN-COOKIE-VALUE>"
99

10-
# converting german umlauts
1110
special_char_map = {ord('ä'):'ae', ord('ü'):'ue', ord('ö'):'oe', ord('ß'):'ss'}
1211

1312
format_examples = '''
@@ -21,139 +20,154 @@
2120
# Command line interface of the script.
parser = argparse.ArgumentParser("xingdumper.py", formatter_class=RawTextHelpFormatter)
parser.add_argument("--url", metavar='<xing-url>', help="A XING company url - https://xing.com/pages/<company>", type=str, required=True)
parser.add_argument("--count", metavar='<number>', help="Amount of employees to extract - max. 2999", type=int, required=False)
parser.add_argument("--cookie", metavar='<cookie>', help="XING 'login' cookie for authentication", type=str, required=False)
parser.add_argument("--full", help="Dump additional contact details (slow) - email, phone, fax, mobile", action='store_true')
parser.add_argument("--email-format", help="Python string format for emails; for example:" + format_examples, metavar='<mail-format>', type=str)
parser.add_argument("--output-json", help="Store results in json output file", metavar="<json-file>", type=str, required=False)
parser.add_argument("--output-csv", help="Store results in csv output file", metavar="<csv-file>", type=str, required=False)

args = parser.parse_args()
url = args.url

# A cookie given on the command line overrides the one stored in the script.
if args.cookie:
    LOGIN_COOKIE = args.cookie

# False (not None) marks "no email format requested" for the checks below.
mailformat = args.email_format or False

# According to XING, the result window must be less than 3000. Anything
# outside 1..2999 (missing, zero, negative or too large) falls back to the
# maximum instead of being forwarded to the API as an invalid page size.
count = args.count if args.count and 0 < args.count < 3000 else 2999

# Endpoint and request settings shared by all API calls.
api = "https://www.xing.com/xing-one/api"
headers = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 'Content-type': 'application/json'}
cookies_dict = {"login": LOGIN_COOKIE}

50-
if (url.startswith('https://www.xing.com/pages/')):
51-
try:
52-
before_keyword, keyword, after_keyword = url.partition('pages/')
53-
company = after_keyword
54-
55-
# retrieve company id from the api
56-
postdata1 = {"operationName":"EntitySubpage","variables":{"id":company,"moduleType":"employees"},"query":"query EntitySubpage($id: SlugOrID!, ) {\n entityPageEX(id: $id) {\n ... on EntityPage {\n slug\n title\n context {\n companyId\n }\n }\n }\n}\n"}
57-
r = requests.post(api, data=json.dumps(postdata1), headers=headers, cookies=cookies_dict, timeout=200)
58-
response1 = r.json()
59-
60-
companyID = response1["data"]["entityPageEX"]["context"]["companyId"]
61-
62-
# retrieve employee information from the api based on previously obtained company id
63-
postdata2 = {"operationName":"Employees","variables":{"consumer":"","id":companyID,"first":count,"query":{"consumer":"web.entity_pages.employees_subpage","sort":"CONNECTION_DEGREE"}},"query":"query Employees($id: SlugOrID!, $first: Int, $after: String, $query: CompanyEmployeesQueryInput!, $consumer: String! = \"\", $includeTotalQuery: Boolean = false) {\n company(id: $id) {\n id\n totalEmployees: employees(first: 0, query: {consumer: $consumer}) @include(if: $includeTotalQuery) {\n total\n }\n employees(first: $first, after: $after, query: $query) {\n total\n edges {\n node {\n profileDetails {\n id\n firstName\n lastName\n displayName\n gender\n pageName\n location {\n displayLocation\n }\n occupations {\n subline\n }\n }\n }\n }\n }\n }\n}\n"}
64-
r2 = requests.post(api, data=json.dumps(postdata2), headers=headers, cookies=cookies_dict, timeout=200)
65-
response2 = r2.json()
66-
67-
if not args.quiet:
68-
69-
print("""\
70-
71-
▒██ ██▒ ██▓ ███▄ █ ▄████ ▓█████▄ █ ██ ███▄ ▄███▓ ██▓███ ▓█████ ██▀███
72-
▒▒ █ █ ▒░▓██▒ ██ ▀█ █ ██▒ ▀█▒▒██▀ ██▌ ██ ▓██▒▓██▒▀█▀ ██▒▓██░ ██▒▓█ ▀ ▓██ ▒ ██▒
73-
░░ █ ░▒██▒▓██ ▀█ ██▒▒██░▄▄▄░░██ █▌▓██ ▒██░▓██ ▓██░▓██░ ██▓▒▒███ ▓██ ░▄█ ▒
74-
░ █ █ ▒ ░██░▓██▒ ▐▌██▒░▓█ ██▓░▓█▄ ▌▓▓█ ░██░▒██ ▒██ ▒██▄█▓▒ ▒▒▓█ ▄ ▒██▀▀█▄
75-
▒██▒ ▒██▒░██░▒██░ ▓██░░▒▓███▀▒░▒████▓ ▒▒█████▓ ▒██▒ ░██▒▒██▒ ░ ░░▒████▒░██▓ ▒██▒
76-
▒▒ ░ ░▓ ░░▓ ░ ▒░ ▒ ▒ ░▒ ▒ ▒▒▓ ▒ ░▒▓▒ ▒ ▒ ░ ▒░ ░ ░▒▓▒░ ░ ░░░ ▒░ ░░ ▒▓ ░▒▓░
77-
░░ ░▒ ░ ▒ ░░ ░░ ░ ▒░ ░ ░ ░ ▒ ▒ ░░▒░ ░ ░ ░ ░ ░░▒ ░ ░ ░ ░ ░▒ ░ ▒░
78-
░ ░ ▒ ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ ░░░ ░ ░ ░ ░ ░░ ░ ░░ ░
79-
░ ░ ░ ░ ░ ░ ░ ░ ░ ░ by LRVT
80-
""")
81-
82-
print("[i] Company Name: " + response1["data"]["entityPageEX"]["title"])
83-
print("[i] Company X-ID: " + companyID)
84-
print("[i] Company Slug: " + company)
85-
print("[i] Dumping Date: " + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
86-
if mailformat:
87-
print("[i] Email Format: " + mailformat)
88-
print()
89-
90-
if not mailformat:
91-
if args.full:
92-
legende = "Firstname;Lastname;Position;Gender;Location;E-Mail;Fax;Mobile;Phone;Profile"
93-
else:
94-
legende = "Firstname;Lastname;Position;Gender;Location;Profile"
95-
else:
96-
if args.full:
97-
legende = "Firstname;Lastname;Email;Position;Gender;Location;E-Mail;Fax;Mobile;Phone;Profile"
98-
else:
99-
legende = "Firstname;Lastname;Email;Position;Gender;Location;Profile"
100-
101-
print(legende)
102-
103-
dump_count = 0
104-
105-
# loop over employees
106-
for employee in response2['data']['company']['employees']['edges']:
107-
dump_count += 1
108-
firstname = employee['node']['profileDetails']['firstName']
109-
lastname = employee['node']['profileDetails']['lastName']
110-
try:
111-
position = employee['node']['profileDetails']['occupations'][0]['subline']
112-
except:
113-
position = "None"
114-
gender = employee['node']['profileDetails']['gender']
115-
location = employee['node']['profileDetails']['location']['displayLocation'].replace('**','').replace(', ',',')
116-
pagename = employee['node']['profileDetails']['pageName']
117-
118-
if args.full:
119-
# dump additional contact details for each employee. Most often is "None", so no default api queries for this data
120-
postdata3 = {"operationName":"getXingId","variables":{"profileId":pagename},"query":"query getXingId($profileId: SlugOrID!, $actionsFilter: [AvailableAction!]) {\n profileModules(id: $profileId) {\n __typename\n xingIdModule(actionsFilter: $actionsFilter) {\n xingId {\n status {\n localizationValue\n __typename\n }\n __typename\n }\n __typename\n ...xingIdContactDetails\n }\n }\n}\n\nfragment xingIdContactDetails on XingIdModule {\n contactDetails {\n business {\n email\n fax {\n phoneNumber\n }\n mobile {\n phoneNumber\n }\n phone {\n phoneNumber\n }\n }\n __typename\n }\n __typename\n}\n"}
121-
r3 = requests.post(api, data=json.dumps(postdata3), headers=headers, cookies=cookies_dict, timeout=200)
122-
response3 = r3.json()
123-
try:
124-
# try to extract contact details
125-
email = response3['data']['profileModules']['xingIdModule']['contactDetails']['business']['email']
126-
fax = response3['data']['profileModules']['xingIdModule']['contactDetails']['business']['fax']['phoneNumber']
127-
mobile = response3['data']['profileModules']['xingIdModule']['contactDetails']['business']['mobile']['phoneNumber']
128-
phone = response3['data']['profileModules']['xingIdModule']['contactDetails']['business']['phone']['phoneNumber']
129-
except:
130-
# if contact details are missing in the API response, set to 'None'
131-
email = "None"
132-
fax = "None"
133-
mobile = "None"
134-
phone = "None"
135-
136-
if not mailformat:
137-
# print employee information as Comma Separated Values (CSV)
138-
print(firstname + ";" + lastname + ";" + position + ";" + gender + ";" + location + ";" + str(email) + ";" + str(fax) + ";" + str(mobile) + ";" + str(phone) + ";" + "https://www.xing.com/profile/" + pagename)
139-
else:
140-
print(firstname + ";" + lastname + ";" + mailformat.format(firstname.lower().replace(".","").translate(special_char_map),lastname.lower().replace(".","").translate(special_char_map)) + ";" + position + ";" + gender + ";" + location + ";" + str(email) + ";" + str(fax) + ";" + str(mobile) + ";" + str(phone) + ";" + "https://www.xing.com/profile/" + pagename)
141-
else:
142-
if not mailformat:
143-
print(firstname + ";" + lastname + ";" + position + ";" + gender + ";" + location + ";" + "https://www.xing.com/profile/" + pagename)
144-
else:
145-
print(firstname + ";" + lastname + ";" + mailformat.format(firstname.lower().replace(".","").translate(special_char_map),lastname.lower().replace(".","").translate(special_char_map)) + ";" + position + ";" + gender + ";" + location + ";" + "https://www.xing.com/profile/" + pagename)
146-
147-
if not args.quiet:
148-
print()
149-
print("[i] Successfully crawled " + str(dump_count) + " " + response1["data"]["entityPageEX"]["title"] + " employees. Hurray ^_-")
150-
151-
except Exception as e:
152-
print()
153-
print("[!] Exception. Either API has changed and this script is broken or authentication failed.")
154-
print(" > Set 'LOGIN_COOKIE' variable permanently in script or use the '--cookie' CLI flag!")
155-
print("[debug] " + str(e))
42+
def _text(value, placeholder="None"):
    """Return value as a string, or placeholder when the API delivered null."""
    return placeholder if value is None else str(value)


def _column_names(with_email, with_contact):
    """Return the result column titles in output order."""
    names = ["Firstname", "Lastname"]
    if with_email:
        names.append("Email")
    names += ["Position", "Gender", "Location"]
    if with_contact:
        names += ["E-Mail", "Fax", "Mobile", "Phone"]
    names.append("Profile")
    return names


def _row_values(entry, with_email, with_contact):
    """Return one employee entry as strings, ordered like _column_names()."""
    keys = ["firstname", "lastname"]
    if with_email:
        keys.append("email")
    keys += ["position", "gender", "location"]
    if with_contact:
        keys += ["business_email", "fax", "mobile", "phone"]
    keys.append("profile")
    return [_text(entry.get(key)) for key in keys]


if url.startswith('https://www.xing.com/pages/'):
    try:
        # Everything after 'pages/' is used as the company identifier.
        _, _, company = url.partition('pages/')

        # Step 1: look up company id and title for the given page.
        postdata1 = {"operationName":"EntitySubpage","variables":{"id":company,"moduleType":"employees"},"query":"query EntitySubpage($id: SlugOrID!, ) { entityPageEX(id: $id) { ... on EntityPage { slug title context { companyId } } } }"}
        r = requests.post(api, data=json.dumps(postdata1), headers=headers, cookies=cookies_dict, timeout=200)
        response1 = r.json()
        companyID = response1["data"]["entityPageEX"]["context"]["companyId"]
        companyTitle = response1["data"]["entityPageEX"]["title"]

        # Step 2: fetch up to `count` employees of that company.
        postdata2 = {"operationName":"Employees","variables":{"consumer":"","id":companyID,"first":count,"query":{"consumer":"web.entity_pages.employees_subpage","sort":"CONNECTION_DEGREE"}},"query":"query Employees($id: SlugOrID!, $first: Int, $after: String, $query: CompanyEmployeesQueryInput!, $consumer: String! = \"\", $includeTotalQuery: Boolean = false) { company(id: $id) { id totalEmployees: employees(first: 0, query: {consumer: $consumer}) @include(if: $includeTotalQuery) { total } employees(first: $first, after: $after, query: $query) { total edges { node { profileDetails { id firstName lastName displayName gender pageName location { displayLocation } occupations { subline } } } } } } }"}
        r2 = requests.post(api, data=json.dumps(postdata2), headers=headers, cookies=cookies_dict, timeout=200)
        response2 = r2.json()

        employees = []
        with_email = bool(mailformat)
        with_contact = bool(args.full)
        # The console listing is only produced when no output file was requested.
        to_console = not args.output_json and not args.output_csv

        if to_console:
            print()
            print("[i] Company Name: " + companyTitle)
            print("[i] Company X-ID: " + companyID)
            print("[i] Company Slug: " + company)
            print("[i] Dumping Date: " + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
            if mailformat:
                print("[i] Email Format: " + mailformat)
            print()
        else:
            print()

        for edge in response2['data']['company']['employees']['edges']:
            details = edge['node']['profileDetails']
            # The API may send JSON null for any field; dict.get() defaults do
            # not apply then, so null is replaced explicitly.
            firstname = _text(details['firstName'], "")
            lastname = _text(details['lastName'], "")
            gender = _text(details.get('gender'), 'N/A')
            location = _text((details.get('location') or {}).get('displayLocation'), '').replace('**','').replace(', ',',')
            pagename = _text(details.get('pageName'), '')
            profile_url = f"https://www.xing.com/profile/{pagename}"
            try:
                position = _text(details['occupations'][0]['subline'])
            except (KeyError, IndexError, TypeError):
                # No occupation listed for this profile.
                position = "None"

            employee_entry = {
                "firstname": firstname,
                "lastname": lastname,
                "position": position,
                "gender": gender,
                "location": location,
                "profile": profile_url
            }

            if mailformat:
                # Normalise names (lower case, no dots, umlauts transliterated)
                # before feeding them into the user supplied format string.
                firstname_clean = firstname.lower().replace(".", "").translate(special_char_map)
                lastname_clean = lastname.lower().replace(".", "").translate(special_char_map)
                employee_entry['email'] = mailformat.format(firstname_clean, lastname_clean)

            if args.full:
                # One extra request per employee for the business contact details.
                postdata3 = {"operationName":"getXingId","variables":{"profileId":pagename},"query":"query getXingId($profileId: SlugOrID!, $actionsFilter: [AvailableAction!]) { profileModules(id: $profileId) { __typename xingIdModule(actionsFilter: $actionsFilter) { xingId { status { localizationValue __typename } __typename } __typename ...xingIdContactDetails } } } fragment xingIdContactDetails on XingIdModule { contactDetails { business { email fax { phoneNumber } mobile { phoneNumber } phone { phoneNumber } } __typename } __typename }"}
                r3 = requests.post(api, data=json.dumps(postdata3), headers=headers, cookies=cookies_dict, timeout=200)
                r3data = r3.json()
                try:
                    contact = r3data['data']['profileModules']['xingIdModule']['contactDetails']['business'] or {}
                except (KeyError, TypeError):
                    # Contact section missing entirely for this profile.
                    contact = {}
                # Each field is read on its own so one missing value does not
                # discard the others.
                employee_entry['business_email'] = _text(contact.get('email'))
                for field in ('fax', 'mobile', 'phone'):
                    employee_entry[field] = _text((contact.get(field) or {}).get('phoneNumber'))

            employees.append(employee_entry)

        if to_console:
            print(";".join(_column_names(with_email, with_contact)))
            for entry in employees:
                print(";".join(_row_values(entry, with_email, with_contact)))
            print()

        if args.output_json:
            try:
                output = {
                    "company_id": companyID,
                    "company_url": url,
                    "company_slug": company,
                    "timestamp": datetime.now().isoformat(),
                    "employees": employees
                }
                with open(args.output_json, 'w', encoding='utf-8') as f:
                    json.dump(output, f, ensure_ascii=False, indent=4)
                print(f"[i] Results written to JSON: {args.output_json}")
            except OSError as e:
                print(f"[!] Error writing JSON: {e}")

        if args.output_csv:
            try:
                with open(args.output_csv, 'w', newline='', encoding='utf-8') as f:
                    writer = csv.writer(f, delimiter=';')
                    # Built via helper instead of rebinding `headers`, which
                    # holds the HTTP request headers.
                    writer.writerow(_column_names(with_email, with_contact))
                    for entry in employees:
                        writer.writerow(_row_values(entry, with_email, with_contact))
                print(f"[i] Results written to CSV: {args.output_csv}")
            except (OSError, csv.Error) as e:
                print(f"[!] Error writing CSV: {e}")

        print(f"[i] Successfully crawled {len(employees)} {companyTitle} employees. Hurray ^_-")

    except Exception as e:
        # Top-level boundary: any failure in the API round trips ends up here.
        print("\n[!] Exception. Either API has changed and this script is broken or authentication failed.")
        print("    > Set 'LOGIN_COOKIE' variable permanently in script or use the '--cookie' CLI flag!")
        print(f"[debug] {e}")
else:
    print("\n[!] Invalid URL provided.")
    print("[i] Example URL: 'https://www.xing.com/pages/appleretaildeutschlandgmbh'")

0 commit comments

Comments
 (0)