Scraping LinkedIn

Ashwini Kumar Maurya
4 min read · Jan 3, 2021

Scraping LinkedIn can be very beneficial for professionals looking for new opportunities, for employers trying to identify potential employee churn, and for headhunters searching for the talent they need. Following the hiQ Labs v. LinkedIn lawsuit, which favoured hiQ Labs, LinkedIn allows scraping of data from its website.

For this story, I use Selenium to scrape the LinkedIn data, as it makes it easy to automate the process when extracting data for multiple profiles.

Necessary Software

To use Selenium, you must have a WebDriver installed. The Chrome WebDriver is a good choice.

import string, pandas as pd, os, sys, json, time, re, itertools
from parsel import Selector
from collections import deque
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
# Chrome command-line switches collected on a selenium Options object.
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
# Raise the pandas display limits to 1000 rows / 1000 columns.
# NOTE(review): the keys are spelled 'display.max.rows' / 'display.max.columns';
# presumably pandas resolves them to display.max_rows / display.max_columns
# through its pattern matching of option names -- confirm.
pd.set_option('display.max.rows',1000)
pd.set_option('display.max.columns',1000)

Setting up driver

I wrote the following function to get a driver that is logged in to a LinkedIn account. Logging in is necessary to browse profile details.

def get_driver_details(userid='user_name', pwd='password',
                       driver_path='/Users/amaur24/Desktop/scrape/chromedriver'):
    """Start Chrome, submit the LinkedIn login form and return the driver.

    Args:
        userid: LinkedIn user name typed into the ``username`` field.
        pwd: LinkedIn password typed into the ``password`` field.
        driver_path: Filesystem path of the chromedriver executable.

    Returns:
        The selenium Chrome driver after the login button has been clicked.

    Raises:
        Exception: Whatever selenium raised while loading or filling the
            login page (typically ``TimeoutException``). The browser is
            closed before the error propagates, so a dead driver is never
            handed back to the caller.
    """
    login_url = ('https://www.linkedin.com/uas/login'
                 '?session_redirect=%2Fvoyager%2FloginRedirect%2Ehtml'
                 '&fromSignIn=true&trk=uno-reg-join-sign-in')
    login_button_xpath = '//*[@id="app__container"]/main/div[2]/form/div[3]/button'

    # Pass the module-level Chrome options so the configured switches are
    # actually applied to the browser session.
    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    try:
        driver.get(login_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="username"]'))
        ).send_keys(userid)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="password"]'))
        ).send_keys(pwd)
        # Wait until the button can be clicked, not merely present in the DOM.
        log_in_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, login_button_xpath)))
        log_in_button.click()
    except Exception:
        # Do not leak the browser process, and do not return a quit driver.
        driver.quit()
        raise

    return driver

# Open the browser session and log in; `driver` is reused by the steps below.
driver=get_driver_details()

Google Search of Profiles

One can find LinkedIn profiles through Google search, given a skill or title, a location, etc. For instance, the following Google search query returns LinkedIn profiles of data scientists in San Francisco.

# Example Google query restricted to LinkedIn profile pages.
search_query = 'site:linkedin.com/in/ AND ' + 'Data Scientist' + ' AND ' + 'San Francisco'

The function below automates this search and returns the LinkedIn links of the profiles found:

def get_profile_links(driver, skill, location, search_pages=None):
    """Run a Google search for LinkedIn profiles and collect result fragments.

    The query is ``site:linkedin.com/in/ AND <skill> AND <location>``. For
    each result page, the page source after the first "Search Result" marker
    is split on the ``yuRUbf`` result class, and every piece following an
    occurrence of that class is kept as a raw HTML fragment.

    Args:
        driver: A selenium WebDriver instance.
        skill: Skill or job title to search for.
        location: Location to search for.
        search_pages: Number of result pages to harvest, starting with the
            first one. ``None`` means 1.

    Returns:
        A list of raw HTML fragments, one per search result, in page order.
        Harvesting stops early (returning what was collected so far) when
        results do not show up or there is no "Next" link to follow.
    """
    if search_pages is None:
        search_pages = 1

    driver.get('https://www.google.com')
    query = driver.find_element(By.NAME, 'q')
    search_query = 'site:linkedin.com/in/ AND ' + skill + ' AND ' + location
    query.send_keys(search_query)
    query.send_keys(Keys.RETURN)

    next_xpath = "//*[contains(local-name(), 'span') and contains(text(), 'Next')]"
    all_links = []

    for page in range(search_pages):
        # Wait for results instead of retrying forever on a page that is
        # still loading or has no results.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'yuRUbf')))
        except TimeoutException:
            break

        # Harvest the current page BEFORE moving on, so page 1 is included.
        sections = driver.page_source.split("Search Result")
        if len(sections) > 1:
            all_links.extend(sections[1].split('yuRUbf')[1:])

        if page + 1 >= search_pages:
            break

        try:
            driver.find_element(By.XPATH, next_xpath).click()
        except WebDriverException:
            # No "Next" link (last page) or it is not clickable: stop cleanly.
            break
        time.sleep(1)

    return all_links
def get_profile_lists(profile_links,
                      url_terminators=('"', "'", '<', '>', ' ', '\t', '\r', '\n')):
    """Extract the profile URL from each raw search-result fragment.

    Each fragment is cut at the first ``https:`` and then truncated at the
    first occurrence of every terminator character, leaving just the URL.

    Args:
        profile_links: Iterable of HTML fragments, each expected to contain
            one ``https:`` link.
        url_terminators: Characters that mark the end of the URL inside a
            fragment (quotes, angle brackets and whitespace by default).

    Returns:
        A list of URL strings, in input order. Fragments that contain no
        ``https:`` link are skipped.
    """
    profile_lists = []

    for fragment in profile_links:
        start = fragment.find("https:")
        if start == -1:
            # find() == -1 would otherwise slice off the last character only.
            continue
        text = fragment[start:]
        for c in url_terminators:
            if c in text:
                text = text.split(c)[0]
        profile_name = text.split(" ")[0].strip().replace('"', "").replace("'", "")
        if profile_name:
            profile_lists.append(profile_name)

    return profile_lists

Fetch Links:
# Search parameters for the example run.
skill = 'Data Scientist'
location = 'San Francisco'
# Harvest up to 10 Google result pages and reduce the fragments to profile URLs.
profiles = get_profile_lists(
    get_profile_links(driver=driver, skill=skill, location=location, search_pages=10))

Fetching Profile Data:

Next, we can define customized functions to fetch individual sections of a profile. For instance, the following function extracts the names of the skills listed on a LinkedIn profile.

def get_data_from_FullProfileSkill(D):
    """Return the skill name stored in a FullProfileSkill element.

    Args:
        D: The element, expected to be a dict that may have a ``'name'`` key.

    Returns:
        ``{'skill_name': <name>}``. The name is ``''`` when ``D`` has no
        ``'name'`` key or cannot be looked up by key at all (e.g. ``None``).
    """
    try:
        skill_name = D['name'] if 'name' in D else ''
    except (TypeError, KeyError):
        # D is not a mapping (None, list, str, ...): treat as "no name".
        skill_name = ''

    return {'skill_name': skill_name}

Function to get the JSON text and relevant Elements

## extract only the leading JSON object from a chunk of page source
def get_json_text(text):
    """Parse the JSON object at the start of ``text``, ignoring what follows.

    The standard-library decoder finds the end of the object, so braces
    that appear inside JSON string values do not end it early.

    Args:
        text: A string expected to begin with ``{``; arbitrary trailing
            content (for example the rest of an HTML page) is allowed.

    Returns:
        The decoded object, or ``None`` (after printing
        ``"not json string"``) when ``text`` does not start with a complete,
        valid JSON object.
    """
    if text and text[0] == '{':
        try:
            # raw_decode stops at the end of the first JSON value.
            parsed, _end = json.JSONDecoder().raw_decode(text)
            return parsed
        except ValueError:
            pass
    print("not json string")
    return None
### get relevant profile data dicts: there are many elements which are not relevant to profile. We can ignore these.
def get_relevant_list_of_dict(L):
    """Drop the elements that are not relevant to the profile.

    An element is dropped when it has a ``'paging'`` key or when its
    ``'entityUrn'`` contains ``'urn:li:fsd_company'``.

    Args:
        L: List of dicts.

    Returns:
        A new list with the remaining dicts, in their original order.
        Elements without an ``'entityUrn'`` (or with a ``None`` one) are
        kept instead of raising ``KeyError``.
    """
    return [
        item for item in L
        if 'paging' not in item
        and 'urn:li:fsd_company' not in (item.get('entityUrn') or '')
    ]

Function to parse the skills data from the profile page and return the dataframe.

def get_parsed_data(D):
    """Given the dictionary of data, parse and return the relevant details.

    Args:
        D: A dict. Its ``'$recipeTypes'`` value may be a string or a list of
            strings; for a list only the first entry is considered.

    Returns:
        The result of ``get_data_from_FullProfileSkill(D)`` when the recipe
        type is ``...identity.profile.FullProfileSkill``. An empty dict is
        returned for any other type, a missing ``'$recipeTypes'`` key, or an
        empty list.
    """
    recipe_types = D.get('$recipeTypes')
    if isinstance(recipe_types, list):
        # An empty list has no first entry; treat it as "no type".
        recipe_type = recipe_types[0] if recipe_types else None
    else:
        recipe_type = recipe_types

    if recipe_type == 'com.linkedin.voyager.dash.deco.identity.profile.FullProfileSkill':
        return get_data_from_FullProfileSkill(D)

    return {}
def get_profile_df(profile_data):
    """Return a pandas DataFrame of profile details.

    Every element of ``profile_data`` is passed through ``get_parsed_data``.
    All non-empty string ``skill_name`` values are joined with commas into a
    single ``skills`` value shared by every row.

    Args:
        profile_data: Iterable of profile element dicts.

    Returns:
        A DataFrame with exactly the columns listed in ``c_names``,
        de-duplicated. Columns for which nothing was parsed are filled with
        NaN. The frame has no rows when ``profile_data`` is empty.
    """
    c_names = ['first_name', 'last_name', 'job_profile_headline', 'skills',
               'industry_experiences', 'certificates', 'profile_summary',
               'current_location_city', 'current_country_name',
               'fields_of_study', 'degree_names', 'school_names']

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    result = pd.DataFrame([get_parsed_data(info) for info in profile_data])
    if result.shape[0] == 0:
        return pd.DataFrame(columns=c_names)

    if 'skill_name' in result.columns:
        # Missing values are NaN (not str) and are filtered out here.
        skills = ",".join(
            x for x in result['skill_name'] if isinstance(x, str) and x)
    else:
        skills = ''
    result['skills'] = skills

    # reindex creates the columns that were never parsed; plain
    # result[c_names] raises KeyError for them.
    return result.reindex(columns=c_names).drop_duplicates()

Finally get the result

# Load the first profile and pull the embedded JSON payload out of the page.
driver.get(profiles[0])
profile_page_source = driver.page_source

# Each embedded JSON payload starts with this prefix. split() drops it, so
# it is glued back on; the text before the first occurrence is not a payload
# and is skipped.
kwd_data = '{"data":{"entityUrn"'
data_strings = [kwd_data + x for x in profile_page_source.split(kwd_data)[1:]]

# Keep the last payload that mentions the full-profile recipe type.
profile_marker = 'com.linkedin.voyager.dash.deco.identity.profile.FullProfileWithEntities'
pos = -1
for i, candidate in enumerate(data_strings):
    if profile_marker in candidate:
        pos = i

# Default to an empty frame so `df` is defined even when nothing was found.
df = pd.DataFrame()
if pos > -1:
    profile_info = get_json_text(data_strings[pos])
    if profile_info is not None:
        profile_data_all = profile_info.get('included', [])
        profile_data = get_relevant_list_of_dict(profile_data_all)
        df = get_profile_df(profile_data)

Here is the profile page:

Skills listed on profile:

Here are the skill names fetched using the code:

Or just skills:

--

--