# Stagehand + Browserbase: Job Application Automation - See README.md for full documentation
import os
import random
import sys
import time
import httpx
from browserbase import Browserbase
from dotenv import load_dotenv
from playwright.sync_api import sync_playwright
from pydantic import BaseModel, Field, HttpUrl
from stagehand import Stagehand
# Load environment variables
load_dotenv()
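# Assumes BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID are set in .env;
# both are read via os.environ below.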
# Define Pydantic schemas for structured data extraction
# Using schemas ensures consistent data extraction even if page layout changes
class JobInfo(BaseModel):
    url: HttpUrl = Field(..., description="Job URL")
    title: str = Field(..., description="Job title")


class JobsData(BaseModel):
    jobs: list[JobInfo]
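# Note: JobsData documents the expected shape of extraction results; main()
# passes an equivalent inline JSON schema to extract() (see the note there)
# and validates each extracted row into a JobInfo directly.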
def get_project_concurrency() -> int:
    """
    Fetch project concurrency limit from Browserbase SDK.

    Retrieves the maximum concurrent sessions allowed for the project,
    capped at 5.
    """
    bb = Browserbase(api_key=os.environ.get("BROWSERBASE_API_KEY"))
    project = bb.projects.retrieve(os.environ.get("BROWSERBASE_PROJECT_ID"))
    return min(project.concurrency, 5)
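# Note: main() below applies to jobs sequentially, so this limit is currently
# informational. A hypothetical parallel variant (illustrative only, not part
# of this script's flow) could fan out up to the limit with a thread pool:
#
#     from concurrent.futures import ThreadPoolExecutor
#
#     with ThreadPoolExecutor(max_workers=get_project_concurrency()) as pool:
#         list(pool.map(apply_to_job, jobs_data))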
def generate_random_email() -> str:
    """Generate a random email address for form submission."""
    random_string = "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=8))
    return f"agent-{random_string}@example.com"
def generate_agent_id() -> str:
    """
    Generate a unique agent identifier for job applications.

    Combines timestamp and random string to ensure uniqueness across
    multiple job applications and sessions.
    """
    timestamp = int(time.time() * 1000)
    random_string = "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=7))
    return f"agent-{timestamp}-{random_string}"
def apply_to_job(job_info: JobInfo):
    """
    Apply to a single job posting with automated form filling.

    Uses Stagehand to navigate to the job page, fill out the application
    form, upload the resume, and submit the application.
    """
    # Initialize Stagehand with Browserbase for cloud-based browser automation
    client = Stagehand(
        browserbase_api_key=os.environ.get("BROWSERBASE_API_KEY"),
    )

    # Start a new session
    start_response = client.sessions.start(
        model_name="google/gemini-2.5-flash",
    )
    session_id = start_response.data.session_id

    try:
        print(f"[{job_info.title}] Session Started")
        print(f"[{job_info.title}] Watch live: https://browserbase.com/sessions/{session_id}")

        # Connect to the browser via CDP
        with sync_playwright() as playwright:
            browser = playwright.chromium.connect_over_cdp(
                f"wss://connect.browserbase.com?apiKey={os.environ['BROWSERBASE_API_KEY']}&sessionId={session_id}"
            )
            context = browser.contexts[0]
            page = context.pages[0] if context.pages else context.new_page()

            # Navigate to the job URL
            page.goto(str(job_info.url))
            print(f"[{job_info.title}] Navigated to job page")

            # Click on the specific job listing to open the application form
            client.sessions.act(
                id=session_id,
                input=f"click on {job_info.title}",
            )
            print(f"[{job_info.title}] Clicked on job")

            # Generate unique identifiers for this application
            agent_id = generate_agent_id()
            email = generate_random_email()
            print(f"[{job_info.title}] Agent ID: {agent_id}")
            print(f"[{job_info.title}] Email: {email}")

            # Fill out application form fields using natural language actions;
            # Stagehand's act() method understands natural language instructions
            client.sessions.act(
                id=session_id,
                input=f"type '{agent_id}' into the agent identifier field",
            )
            client.sessions.act(
                id=session_id,
                input=f"type '{email}' into the contact endpoint field",
            )
            client.sessions.act(
                id=session_id,
                input="type 'us-west-2' into the deployment region field",
            )

            # Upload the agent profile/resume file: use observe() to find the
            # upload element, then set the files programmatically
            observe_response = client.sessions.observe(
                id=session_id,
                instruction="find the file upload button for agent profile",
            )
            upload_actions = observe_response.data.results or []
            if upload_actions:
                upload_action = upload_actions[0]
                upload_selector = (
                    str(upload_action.selector) if hasattr(upload_action, "selector") else None
                )
                if upload_selector:
                    file_input = page.locator(upload_selector)

                    # Fetch the resume PDF from its remote URL with httpx before uploading
                    resume_url = "https://agent-job-board.vercel.app/Agent%20Resume.pdf"
                    with httpx.Client() as http_client:
                        response = http_client.get(resume_url)
                        if response.status_code != 200:
                            raise Exception(f"Failed to fetch resume: {response.status_code}")
                        resume_buffer = response.content

                    # Upload the file using Playwright's set_input_files with an in-memory buffer
                    file_input.set_input_files(
                        {
                            "name": "Agent Resume.pdf",
                            "mimeType": "application/pdf",
                            "buffer": resume_buffer,
                        }
                    )
                    print(f"[{job_info.title}] Uploaded resume from {resume_url}")

            # Select the multi-region deployment option
            client.sessions.act(
                id=session_id,
                input="select 'Yes' for multi region deployment",
            )

            # Submit the application form
            client.sessions.act(
                id=session_id,
                input="click deploy agent button",
            )
            print(f"[{job_info.title}] Application submitted successfully!")

            browser.close()

        client.sessions.end(id=session_id)
    except Exception as error:
        print(f"[{job_info.title}] Error: {error}")
        client.sessions.end(id=session_id)
        raise
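# Playwright's set_input_files() accepts an in-memory file payload (a dict with
# "name", "mimeType", and "buffer" keys), which is why apply_to_job() can upload
# the downloaded PDF without writing a temporary file to disk.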
def main():
    """
    Main application entry point.

    Orchestrates the job application process:
    1. Fetches project concurrency limits
    2. Scrapes job listings from the job board
    3. Applies to all jobs sequentially
    """
    print("Starting Job Application Automation...")

    # Get project concurrency limit
    max_concurrency = get_project_concurrency()
    print(f"Project concurrency limit: {max_concurrency}")

    # Initialize Stagehand with Browserbase for cloud-based browser automation
    # (main session for job scraping)
    client = Stagehand(
        browserbase_api_key=os.environ.get("BROWSERBASE_API_KEY"),
    )

    # Start a new session
    start_response = client.sessions.start(
        model_name="google/gemini-2.5-flash",
    )
    session_id = start_response.data.session_id
    print("Main Stagehand Session Started")
    print(f"Watch live: https://browserbase.com/sessions/{session_id}")
    try:
        # Connect to the browser via CDP
        with sync_playwright() as playwright:
            browser = playwright.chromium.connect_over_cdp(
                f"wss://connect.browserbase.com?apiKey={os.environ['BROWSERBASE_API_KEY']}&sessionId={session_id}"
            )
            context = browser.contexts[0]
            page = context.pages[0] if context.pages else context.new_page()

            # Navigate to agent job board homepage
            page.goto("https://agent-job-board.vercel.app/")
            print("Navigated to agent-job-board.vercel.app")

            # Click on "View Jobs" button to access job listings
            client.sessions.act(
                id=session_id,
                input="click on the view jobs button",
            )
            print("Clicked on view jobs button")

            # Extract all job listings with titles and URLs using inline schema (avoids $ref issues)
            jobs_schema = {
                "type": "object",
                "properties": {
                    "jobs": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "url": {"type": "string", "description": "Job URL"},
                                "title": {"type": "string", "description": "Job title"},
                            },
                            "required": ["url", "title"],
                        },
                    }
                },
                "required": ["jobs"],
            }
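            # Note: passing JobsData.model_json_schema() here would emit nested
            # "$defs"/"$ref" entries for JobInfo, which the extract endpoint may
            # not resolve; the flat inline schema above sidesteps that.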
            extract_response = client.sessions.extract(
                id=session_id,
                instruction="extract all job listings with their titles and URLs",
                schema=jobs_schema,
            )
            jobs_result = extract_response.data.result
            jobs_data = [
                JobInfo(url=job["url"], title=job["title"]) for job in jobs_result.get("jobs", [])
            ]
            print(f"Found {len(jobs_data)} jobs")

            browser.close()

        client.sessions.end(id=session_id)
    except Exception as error:
        print(f"Error during job scraping: {error}")
        client.sessions.end(id=session_id)
        raise

    # Apply to all jobs sequentially
    print(f"Starting to apply to {len(jobs_data)} jobs...")
    for job in jobs_data:
        try:
            apply_to_job(job)
        except Exception as error:
            print(f"Failed to apply to {job.title}: {error}")
            continue

    print("All applications completed!")
if __name__ == "__main__":
    try:
        main()
    except Exception as err:
        print(f"Error: {err}")
        sys.exit(1)