# Stagehand + Browserbase: Job Application Automation - See README.md for full documentation
import os
import random
import sys
import time
import httpx
from browserbase import Browserbase
from dotenv import load_dotenv
from playwright.sync_api import sync_playwright
from pydantic import BaseModel, Field, HttpUrl
from stagehand import Stagehand
# Load environment variables
load_dotenv()
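# Assumes BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID are set in .env;
# both are read via os.environ below.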
# Define Pydantic schemas for structured data extraction
# Using schemas ensures consistent data extraction even if page layout changes
class JobInfo(BaseModel):
    url: HttpUrl = Field(..., description="Job URL")
    title: str = Field(..., description="Job title")


class JobsData(BaseModel):
    jobs: list[JobInfo]
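# Note: JobsData documents the expected shape of extraction results; main()
# passes an equivalent inline JSON schema to extract() (see the note there)
# and validates each extracted row into a JobInfo directly.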
def get_project_concurrency() -> int:
    """
    Fetch project concurrency limit from Browserbase SDK.

    Retrieves the maximum concurrent sessions allowed for the project,
    capped at 5.
    """
    bb = Browserbase(api_key=os.environ.get("BROWSERBASE_API_KEY"))
    project = bb.projects.retrieve(os.environ.get("BROWSERBASE_PROJECT_ID"))
    return min(project.concurrency, 5)
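# Note: main() below applies to jobs sequentially, so this limit is currently
# informational. A hypothetical parallel variant (illustrative only, not part
# of this script's flow) could fan out up to the limit with a thread pool:
#
#     from concurrent.futures import ThreadPoolExecutor
#
#     with ThreadPoolExecutor(max_workers=get_project_concurrency()) as pool:
#         list(pool.map(apply_to_job, jobs_data))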
def generate_random_email() -> str:
    """Generate a random email address for form submission."""
    random_string = "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=8))
    return f"agent-{random_string}@example.com"
def generate_agent_id() -> str:
    """
    Generate a unique agent identifier for job applications.

    Combines timestamp and random string to ensure uniqueness across
    multiple job applications and sessions.
    """
    timestamp = int(time.time() * 1000)
    random_string = "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=7))
    return f"agent-{timestamp}-{random_string}"
def apply_to_job(job_info: JobInfo):
    """
    Apply to a single job posting with automated form filling.

    Uses Stagehand to navigate to the job page, fill out the application
    form, upload the resume, and submit the application.
    """
    # Initialize Stagehand with Browserbase for cloud-based browser automation
    client = Stagehand(
        browserbase_api_key=os.environ.get("BROWSERBASE_API_KEY"),
    )

    # Start a new session
    start_response = client.sessions.start(
        model_name="google/gemini-2.5-flash",
    )
    session_id = start_response.data.session_id

    try:
        print(f"[{job_info.title}] Session Started")
        print(f"[{job_info.title}] Watch live: https://browserbase.com/sessions/{session_id}")

        # Connect to the browser via CDP
        with sync_playwright() as playwright:
            browser = playwright.chromium.connect_over_cdp(
                f"wss://connect.browserbase.com?apiKey={os.environ['BROWSERBASE_API_KEY']}&sessionId={session_id}"
            )
            context = browser.contexts[0]
            page = context.pages[0] if context.pages else context.new_page()

            # Navigate to the job URL
            page.goto(str(job_info.url))
            print(f"[{job_info.title}] Navigated to job page")

            # Click on the specific job listing to open the application form
            client.sessions.act(
                id=session_id,
                input=f"click on {job_info.title}",
            )
            print(f"[{job_info.title}] Clicked on job")

            # Generate unique identifiers for this application
            agent_id = generate_agent_id()
            email = generate_random_email()
            print(f"[{job_info.title}] Agent ID: {agent_id}")
            print(f"[{job_info.title}] Email: {email}")

            # Fill out application form fields using natural language actions;
            # Stagehand's act() method understands natural language instructions
            client.sessions.act(
                id=session_id,
                input=f"type '{agent_id}' into the agent identifier field",
            )
            client.sessions.act(
                id=session_id,
                input=f"type '{email}' into the contact endpoint field",
            )
            client.sessions.act(
                id=session_id,
                input="type 'us-west-2' into the deployment region field",
            )

            # Upload the agent profile/resume file: use observe() to find the
            # upload element, then set the files programmatically
            observe_response = client.sessions.observe(
                id=session_id,
                instruction="find the file upload button for agent profile",
            )
            upload_actions = observe_response.data.results or []
            if upload_actions:
                upload_action = upload_actions[0]
                upload_selector = (
                    str(upload_action.selector) if hasattr(upload_action, "selector") else None
                )
                if upload_selector:
                    file_input = page.locator(upload_selector)

                    # Fetch the resume PDF from its remote URL with httpx before uploading
                    resume_url = "https://agent-job-board.vercel.app/Agent%20Resume.pdf"
                    with httpx.Client() as http_client:
                        response = http_client.get(resume_url)
                        if response.status_code != 200:
                            raise Exception(f"Failed to fetch resume: {response.status_code}")
                        resume_buffer = response.content

                    # Upload the file using Playwright's set_input_files with an in-memory buffer
                    file_input.set_input_files(
                        {
                            "name": "Agent Resume.pdf",
                            "mimeType": "application/pdf",
                            "buffer": resume_buffer,
                        }
                    )
                    print(f"[{job_info.title}] Uploaded resume from {resume_url}")

            # Select the multi-region deployment option
            client.sessions.act(
                id=session_id,
                input="select 'Yes' for multi region deployment",
            )

            # Submit the application form
            client.sessions.act(
                id=session_id,
                input="click deploy agent button",
            )
            print(f"[{job_info.title}] Application submitted successfully!")

            browser.close()

        client.sessions.end(id=session_id)
    except Exception as error:
        print(f"[{job_info.title}] Error: {error}")
        client.sessions.end(id=session_id)
        raise
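# Playwright's set_input_files() accepts an in-memory file payload (a dict with
# "name", "mimeType", and "buffer" keys), which is why apply_to_job() can upload
# the downloaded PDF without writing a temporary file to disk.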
def main():
    """
    Main application entry point.

    Orchestrates the job application process:
    1. Fetches project concurrency limits
    2. Scrapes job listings from the job board
    3. Applies to all jobs sequentially
    """
    print("Starting Job Application Automation...")

    # Get project concurrency limit
    max_concurrency = get_project_concurrency()
    print(f"Project concurrency limit: {max_concurrency}")

    # Initialize Stagehand with Browserbase for cloud-based browser automation
    # (main session for job scraping)
    client = Stagehand(
        browserbase_api_key=os.environ.get("BROWSERBASE_API_KEY"),
    )

    # Start a new session
    start_response = client.sessions.start(
        model_name="google/gemini-2.5-flash",
    )
    session_id = start_response.data.session_id
    print("Main Stagehand Session Started")
    print(f"Watch live: https://browserbase.com/sessions/{session_id}")
    try:
        # Connect to the browser via CDP
        with sync_playwright() as playwright:
            browser = playwright.chromium.connect_over_cdp(
                f"wss://connect.browserbase.com?apiKey={os.environ['BROWSERBASE_API_KEY']}&sessionId={session_id}"
            )
            context = browser.contexts[0]
            page = context.pages[0] if context.pages else context.new_page()

            # Navigate to agent job board homepage
            page.goto("https://agent-job-board.vercel.app/")
            print("Navigated to agent-job-board.vercel.app")

            # Click on "View Jobs" button to access job listings
            client.sessions.act(
                id=session_id,
                input="click on the view jobs button",
            )
            print("Clicked on view jobs button")

            # Extract all job listings with titles and URLs using inline schema (avoids $ref issues)
            jobs_schema = {
                "type": "object",
                "properties": {
                    "jobs": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "url": {"type": "string", "description": "Job URL"},
                                "title": {"type": "string", "description": "Job title"},
                            },
                            "required": ["url", "title"],
                        },
                    }
                },
                "required": ["jobs"],
            }
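            # Note: passing JobsData.model_json_schema() here would emit nested
            # "$defs"/"$ref" entries for JobInfo, which the extract endpoint may
            # not resolve; the flat inline schema above sidesteps that.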
            extract_response = client.sessions.extract(
                id=session_id,
                instruction="extract all job listings with their titles and URLs",
                schema=jobs_schema,
            )
            jobs_result = extract_response.data.result
            jobs_data = [
                JobInfo(url=job["url"], title=job["title"]) for job in jobs_result.get("jobs", [])
            ]
            print(f"Found {len(jobs_data)} jobs")

            browser.close()

        client.sessions.end(id=session_id)
    except Exception as error:
        print(f"Error during job scraping: {error}")
        client.sessions.end(id=session_id)
        raise

    # Apply to all jobs sequentially
    print(f"Starting to apply to {len(jobs_data)} jobs...")
    for job in jobs_data:
        try:
            apply_to_job(job)
        except Exception as error:
            print(f"Failed to apply to {job.title}: {error}")
            continue

    print("All applications completed!")
if __name__ == "__main__":
    try:
        main()
    except Exception as err:
        print(f"Error: {err}")
        sys.exit(1)