As I’m graduating and entering the job market, I’ve been entertaining the idea of working for the Dutch government. While looking through the job vacancies on werkenvoornederland.nl I wanted to filter on more specific criteria, and I was curious about the trends and requirements across departments. I figured it would be helpful for my search to scrape the site, and interesting to get a better overview of the job market.

The scraped data is published here (Google Sheets); last updated 11 April 2025. Below is the code I used to scrape the data, in case you want to try it yourself.

I’m still working on automating the process with a GitHub Action or a cron job on my Raspberry Pi, so I can update the data daily.

While I have few ethical objections to scraping the site, I do have concerns about automating the application process. Spinning up a Selenium project with an AI model to send out personalized applications for each vacancy doesn’t seem too hard, but for now I prefer to keep putting some thought into each application myself.

Fetch all vacancy URLs

import requests
from bs4 import BeautifulSoup

import pandas as pd
from datetime import datetime
import os
import random
import time

BASE_URL = "https://www.werkenvoornederland.nl"
OUTPUT_FILE = "output.csv"
ARCHIVE_FILE = "output_archive.csv"

ALL_FIELDS = [
    "Titel", "Link", "Plaatsingsdatum", "Sluitingsdatum",
    "Locatie", "Niveau", "Uren", "Salaris", "Vakgebied", "Arbeidsovereenkomst"
]

DUTCH_MONTHS = {
    "januari": "01", "februari": "02", "maart": "03", "april": "04",
    "mei": "05", "juni": "06", "juli": "07", "augustus": "08",
    "september": "09", "oktober": "10", "november": "11", "december": "12"
}

# Dutch date to dd-mm-yyyy
def convert_dutch_date(date_str):
    parts = date_str.strip().lower().split()
    if len(parts) == 3:
        day, month_dutch, year = parts
        month = DUTCH_MONTHS.get(month_dutch.lower(), "01")
        return f"{day.zfill(2)}-{month}-{year}"
    return None
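
# Example: convert_dutch_date("11 april 2025") -> "11-04-2025". A string that is not
# "<day> <month> <year>" returns None, and an unrecognised month name falls back to "01".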

# Fetch vacancies list
def fetch_vacancies(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    vacancies = []
    today = datetime.today()

    for section in soup.find_all("section", class_="vacancy"):
        title_tag = section.find("h2", class_="vacancy__title").find("a")
        title = title_tag.text.strip()
        relative_link = title_tag.get("href")
        full_link = BASE_URL + relative_link

        plaatsingsdatum = ""
        top_info = section.find("div", class_="job-short-info__top")
        if top_info:
            for line in top_info.stripped_strings:
                if "Plaatsingsdatum:" in line:
                    raw_date = line.replace("Plaatsingsdatum:", "").strip()
                    plaatsingsdatum = convert_dutch_date(raw_date)

        sluitingsdatum = ""
        sluiting_tag = section.find("span", class_="vacancy-publication-end")
        if sluiting_tag:
            text = sluiting_tag.text.strip()
            if "voor" in text:
                raw_date = text.split("voor", 1)[-1].strip()
                sluitingsdatum = convert_dutch_date(raw_date)

        try:
            sluitingsdatum_obj = datetime.strptime(sluitingsdatum, "%d-%m-%Y")
            if sluitingsdatum_obj < today:
                continue
        except (TypeError, ValueError):
            # Skip vacancies without a parseable closing date
            continue

        vacancies.append({
            "Titel": title,
            "Link": full_link,
            "Plaatsingsdatum": plaatsingsdatum,
            "Sluitingsdatum": sluitingsdatum,
            "Locatie": None,
            "Niveau": None,
            "Uren": None,
            "Salaris": None,
            "Vakgebied": None,
            "Arbeidsovereenkomst": None
        })

    return vacancies
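
# Each dict from fetch_vacancies() only carries the list-page fields
# (Titel, Link, Plaatsingsdatum, Sluitingsdatum); the detail fields stay None
# here and are filled in by the second script further down.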


# LOGIC

existing_data = pd.DataFrame()
today = datetime.today().date()

# Load existing data
if os.path.exists(OUTPUT_FILE):
    existing_data = pd.read_csv(OUTPUT_FILE)
    existing_data["Sluitingsdatum"] = pd.to_datetime(existing_data["Sluitingsdatum"], format="%d-%m-%Y", errors="coerce")
    existing_data["Plaatsingsdatum"] = pd.to_datetime(existing_data["Plaatsingsdatum"], format="%d-%m-%Y", errors="coerce")

    # Identify expired vacancies
    expired_vacancies = existing_data[existing_data["Sluitingsdatum"].dt.date <= today]

    # Archive expired vacancies
    if not expired_vacancies.empty:
        if os.path.exists(ARCHIVE_FILE):
            archive_df = pd.read_csv(ARCHIVE_FILE)
            archive_df = pd.concat([archive_df, expired_vacancies])
            archive_df.drop_duplicates(subset=["Titel", "Link"], inplace=True)
        else:
            archive_df = expired_vacancies

        archive_df.to_csv(ARCHIVE_FILE, index=False, encoding="utf-8-sig")
        print(f"Archived {len(expired_vacancies)} expired vacancies.")

    # Keep only active vacancies
    existing_data = existing_data[existing_data["Sluitingsdatum"].dt.date > today]
    print(f"Local data loaded: {len(existing_data)} active vacancies.")

# Pick the page to scrape (check whether today's vacancies were already scraped)
scraped_today = False
if not existing_data.empty and (existing_data["Plaatsingsdatum"].dt.date == today).any():
    scraped_today = True

url = f"{BASE_URL}/vacatures?pagina=1" if scraped_today else f"{BASE_URL}/vacatures?pagina={random.randint(200, 250)}"
print(f"Scraping {url}")

# Fetch
new_vacancies = fetch_vacancies(url)
df_new = pd.DataFrame(new_vacancies)

# Merge, omit duplicates
if not existing_data.empty:
    combined = pd.concat([existing_data, df_new])
    combined.drop_duplicates(subset=["Titel", "Link"], inplace=True)
else:
    combined = df_new

combined = combined.reindex(columns=ALL_FIELDS)
combined.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
print(f"Saved {len(combined)} active vacancies.")

Fetch details for each vacancy

# Load CSV
OUTPUT_FILE = "output.csv"
combined = pd.read_csv(OUTPUT_FILE, encoding="utf-8-sig")
combined.fillna("", inplace=True)

# Track all columns dynamically (the accordion sections can differ per vacancy)
ALL_FIELDS = list(combined.columns)

# Normalize an accordion title into a column name
def normalize_title(title):
    return title.lower().replace(" ", "_").replace("-", "_").replace("\u00ad", "_")  # "\u00ad" is a soft hyphen
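
# Example (illustrative): normalize_title("Functie-eisen") becomes "functie_eisen";
# spaces, hyphens and soft hyphens all turn into underscores.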

# scrape dynamic 'accordion' sections
def scrape_accordion_sections(soup):
    accordion_data = {}
    accordion_div = soup.find("div", {"id": "accordionGroup"})

    if accordion_div:
        items = accordion_div.find_all("div", class_="Accordion-panel")
        for panel in items:
            try:
                button = panel.find_previous("button")
                if button:
                    raw_title = button.get("data-item") or button.get_text(strip=True)
                    title = normalize_title(raw_title)

                    content_div = panel.find("div", class_="s-article-content")
                    if content_div:
                        text_parts = []
                        for tag in content_div.find_all(["p", "li"], recursive=True):  # "ul" omitted: its text would duplicate the "li" entries
                            text = tag.get_text(strip=True, separator=" ")
                            if text:
                                text_parts.append(text)

                        accordion_data[title] = "\n".join(text_parts)
            except Exception as e:
                print(f"Error parsing accordion section: {e}")
    return accordion_data
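
# The returned dict has one entry per accordion panel, keyed by its normalized
# heading; the exact keys depend on which sections a vacancy page has, which is
# why the CSV's column set grows dynamically.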

# Scrape individual page info
def scrape_details(row):
    link = row["Link"]
    print(f"Scraping details from {link}")

    try:
        response = requests.get(link)
        soup = BeautifulSoup(response.content, "html.parser")

        # LOCATIE
        locatie = None
        locatie_span = soup.find("span", {"title": "Locatie"})
        if locatie_span:
            sibling = locatie_span.find_next_sibling("span")
            if sibling:
                locatie = sibling.get_text(strip=True)

        # NIVEAU
        niveau = None
        niveau_span = soup.find("span", {"title": "Niveau"})
        if niveau_span:
            sibling = niveau_span.find_next_sibling("span")
            if sibling:
                niveau = sibling.get_text(strip=True)
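
        # UREN (hours per week): assumes the page uses a span with title="Uren",
        # like the other summary fields, so this may stay None if the markup differs
        uren = None
        uren_span = soup.find("span", {"title": "Uren"})
        if uren_span:
            sibling = uren_span.find_next_sibling("span")
            if sibling:
                uren = sibling.get_text(strip=True)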

        # SALARIS
        salaris = None
        salaris_span = soup.find("span", {"title": "Salaris"})
        if salaris_span:
            value_container = salaris_span.find_next_sibling("span")
            if value_container:
                salaris = " ".join(value_container.stripped_strings)

        # VAKGEBIED
        vakgebied = None
        vakgebied_span = soup.find("span", {"title": "Vakgebied"})
        if vakgebied_span:
            sibling = vakgebied_span.find_next_sibling("span")
            if sibling:
                vakgebied = sibling.get_text(strip=True)

        # ARBEIDSOVEREENKOMST
        arbeidsovereenkomst = None
        arbeid_span = soup.find("span", {"title": "Arbeidsovereenkomst"})
        if arbeid_span:
            sibling = arbeid_span.find_next_sibling("span")
            if sibling:
                arbeidsovereenkomst = sibling.get_text(strip=True)

        # update the fixed detail fields
        row["Locatie"] = locatie
        row["Niveau"] = niveau
        row["Uren"] = uren
        row["Salaris"] = salaris
        row["Vakgebied"] = vakgebied
        row["Arbeidsovereenkomst"] = arbeidsovereenkomst

        # scrape accordion content
        accordion_sections = scrape_accordion_sections(soup)

        # add dynamic fields to the row
        for key, value in accordion_sections.items():
            row[key] = value
            if key not in ALL_FIELDS:
                ALL_FIELDS.append(key)

    except Exception as e:
        print(f"Failed to scrape {link}: {e}")

    return row

# process each row that needs scraping
for idx, row in combined.iterrows():
    print(f"[{idx+1}/{len(combined)}] Checking: {row['Titel']}")
    if row["Locatie"] == "" and row["Uren"] == "" and row["Salaris"] == "":
        updated_row = scrape_details(row)

        # update fixed fields
        for field in ["Locatie", "Niveau", "Uren", "Salaris", "Vakgebied", "Arbeidsovereenkomst"]:
            combined.at[idx, field] = updated_row.get(field, "")

        # update dynamic accordion fields
        for key in updated_row.keys():
            if key not in combined.columns:
                combined[key] = ""  # create column if missing
            combined.at[idx, key] = updated_row[key]

        # save data (after each scrape so we can quit safely)
        combined.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
        print(f"Updated: {row['Titel']}")
        time.sleep(random.randint(1, 5))  # Rate limit

print("done scraping missing details!")