Appearance
Computer Automation Guide
Overview
The AGB SDK provides comprehensive computer automation capabilities in the cloud. You can control desktop environments through mouse operations, keyboard input, screen capture, application management and window control. This guide covers everything from basic UI interactions to advanced desktop automation workflows.
Quick Reference (1 minute)
python
from agb import AGB
from agb.session_params import CreateSessionParams
from agb import MouseButton, ScrollDirection
agb = AGB()
params = CreateSessionParams(image_id="agb-computer-use-ubuntu-2204")
result = agb.create(params)
if result.success:
session = result.session
# Mouse operations
result = session.computer.click_mouse(500, 300, MouseButton.LEFT)
session.computer.move_mouse(600, 400)
session.computer.scroll(500, 500, ScrollDirection.DOWN, 3)
# Keyboard operations
session.computer.input_text("Hello, World!")
session.computer.press_keys(["Ctrl", "a"])
# Screen operations
screenshot_result = session.computer.screenshot()
screen_size = session.computer.get_screen_size()
# Application management
apps_result = session.computer.get_installed_apps()
start_result = session.computer.start_app(start_cmd)
session.computer.stop_app_by_pname(pname)
# Window management
windows_result = session.computer.list_root_windows()
if windows_result.success and windows_result.windows:
window_id = windows_result.windows[0].window_id
session.computer.activate_window(window_id)
session.computer.maximize_window(window_id)
agb.delete(session)
else:
print(f"Failed to create session: {result.error_message}")
Basic Usage (5 minutes)
Session Setup
python
from agb import AGB
from agb.session_params import CreateSessionParams
agb = AGB()
params = CreateSessionParams(image_id="agb-computer-use-ubuntu-2204")
result = agb.create(params)
if result.success:
session = result.session
print(f"Session created: {session.session_id}")
else:
print(f"Failed to create session: {result.error_message}")
exit(1)
Supported System Images:
- agb-computer-use-ubuntu-2204 - Linux desktop environment with GUI for computer automation
UI Operations (15 minutes)
Mouse Operations
Click Operations
python
from agb import MouseButton
# Left click
result = session.computer.click_mouse(x=500, y=300, button=MouseButton.LEFT)
if result.success:
print("Left click successful")
# Right click (context menu)
result = session.computer.click_mouse(x=500, y=300, button=MouseButton.RIGHT)
if result.success:
print("Right click successful")
# Middle click (scroll wheel)
result = session.computer.click_mouse(x=500, y=300, button=MouseButton.MIDDLE)
if result.success:
print("Middle click successful")
# Double left click
result = session.computer.click_mouse(x=500, y=300, button=MouseButton.DOUBLE_LEFT)
if result.success:
print("Double click successful")
Supported Mouse Buttons:
- MouseButton.LEFT - Single left click
- MouseButton.RIGHT - Right click (context menu)
- MouseButton.MIDDLE - Middle click (scroll wheel)
- MouseButton.DOUBLE_LEFT - Double left click
Move Mouse
python
result = session.computer.move_mouse(x=600, y=400)
if result.success:
print("Mouse moved successfully")
# Get current cursor position
cursor_result = session.computer.get_cursor_position()
if cursor_result.success:
import json
cursor_data = json.loads(cursor_result.data)
print(f"Cursor at x={cursor_data['x']}, y={cursor_data['y']}")
Drag Operations
python
from agb import MouseButton
# Drag with left button
result = session.computer.drag_mouse(
from_x=100,
from_y=100,
to_x=200,
to_y=200,
button=MouseButton.LEFT
)
if result.success:
print("Drag operation successful")
# Drag with right button
result = session.computer.drag_mouse(
from_x=300,
from_y=300,
to_x=400,
to_y=400,
button=MouseButton.RIGHT
)
Note: DOUBLE_LEFT is not supported for drag operations. Use LEFT, RIGHT, or MIDDLE only.
Scroll Operations
python
from agb import ScrollDirection
# Scroll up
result = session.computer.scroll(x=500, y=500, direction=ScrollDirection.UP, amount=3)
if result.success:
print("Scrolled up successfully")
# Scroll down
result = session.computer.scroll(x=500, y=500, direction=ScrollDirection.DOWN, amount=5)
if result.success:
print("Scrolled down successfully")
# Horizontal scrolling
result = session.computer.scroll(x=500, y=500, direction=ScrollDirection.LEFT, amount=2)
result = session.computer.scroll(x=500, y=500, direction=ScrollDirection.RIGHT, amount=2)
Supported Scroll Directions:
- ScrollDirection.UP - Scroll up
- ScrollDirection.DOWN - Scroll down
- ScrollDirection.LEFT - Scroll left
- ScrollDirection.RIGHT - Scroll right
Keyboard Operations
Text Input
python
result = session.computer.input_text("Hello AGB!")
if result.success:
print("Text input successful")
Key Combinations
python
# Press Ctrl+A to select all
result = session.computer.press_keys(keys=["Ctrl", "a"])
if result.success:
print("Select all command sent")
# Press Ctrl+C to copy
result = session.computer.press_keys(keys=["Ctrl", "c"])
# Press Ctrl+V to paste
result = session.computer.press_keys(keys=["Ctrl", "v"])
Hold and Release Keys
python
# Hold Ctrl key
result = session.computer.press_keys(keys=["Ctrl"], hold=True)
if result.success:
print("Ctrl key held")
# Release Ctrl key
result = session.computer.release_keys(keys=["Ctrl"])
if result.success:
print("Ctrl key released")
Screen Operations
Take Screenshot
python
result = session.computer.screenshot()
if result.success:
screenshot_url = result.data
print(f"Screenshot URL: {screenshot_url}")
# The URL points to the screenshot image in cloud storage
Get Screen Information
python
result = session.computer.get_screen_size()
if result.success:
import json
screen_data = json.loads(result.data)
print(f"Screen size: {screen_data['width']}x{screen_data['height']}")
print(f"DPI scaling: {screen_data['dpiScalingFactor']}")
Application Management (15 minutes)
Discovering Installed Applications
python
result = session.computer.get_installed_apps(
start_menu=True,
desktop=False,
ignore_system_apps=True
)
if result.success:
apps = result.data
print(f"Found {len(apps)} installed applications")
for app in apps[:5]: # Show first 5 apps
print(f"Name: {app.name}")
print(f"Start Command: {app.start_cmd}")
print(f"Stop Command: {app.stop_cmd if app.stop_cmd else 'N/A'}")
print(f"Work Directory: {app.work_directory if app.work_directory else 'N/A'}")
print("---")
Parameters:
- start_menu (bool): Include applications from Start Menu
- desktop (bool): Include desktop shortcuts
- ignore_system_apps (bool): Filter out system applications
Starting Applications
Method 1: Start by Command
python
result = session.computer.start_app(start_cmd)
if result.success:
processes = result.data
print(f"Application started with {len(processes)} processes")
for process in processes:
print(f"Process: {process.pname} (PID: {process.pid})")
Method 2: Start with Working Directory
python
start_cmd = "notepad.exe"
work_directory = "C:\\Users\\Public\\Documents"
result = session.computer.start_app(
start_cmd=start_cmd,
work_directory=work_directory
)
if result.success:
print("Application started with custom working directory")
Method 3: Start from Installed Apps List
python
# Find and start a specific application
apps_result = session.computer.get_installed_apps()
if apps_result.success:
# Match case-insensitively: the needle must be lower-case because it is compared with app.name.lower()
target_app = next((app for app in apps_result.data if "google chrome" in app.name.lower()), None)
if target_app:
print(f"Starting {target_app.name}...")
start_result = session.computer.start_app(target_app.start_cmd)
if start_result.success:
print("Application started successfully!")
Monitoring Running Applications
python
result = session.computer.list_visible_apps()
if result.success:
visible_apps = result.data
print(f"Found {len(visible_apps)} visible applications")
for app in visible_apps:
print(f"App: {app.pname} (PID: {app.pid})")
if app.cmdline:
print(f"Command: {app.cmdline}")
print("---")
Stopping Applications
Method 1: Stop by Process Name
python
result = session.computer.stop_app_by_pname("notepad.exe")
if result.success:
print("Application stopped successfully")
else:
print(f"Failed to stop application: {result.error_message}")
Method 2: Stop by Process ID
python
# Get PID from start_app or list_visible_apps
start_result = session.computer.start_app(start_cmd)
if start_result.success:
target_pid = start_result.data[0].pid # Get first process PID
stop_result = session.computer.stop_app_by_pid(target_pid)
if stop_result.success:
print(f"Successfully stopped process {target_pid}")
Method 3: Stop by Command
python
# Using stop command from installed apps
apps_result = session.computer.get_installed_apps()
if apps_result.success:
for app in apps_result.data:
if app.stop_cmd and "Google Chrome" in app.name.lower():
stop_result = session.computer.stop_app_by_cmd(app.stop_cmd)
if stop_result.success:
print(f"Stopped {app.name} using stop command")
break
Window Management (20 minutes)
Important: Window management operations require that applications are already running. You must start an application first before you can manage its windows.
Discovering Windows
python
# First, start an application to have windows to manage
start_result = session.computer.start_app(start_cmd)
if not start_result.success:
print("Failed to start application")
exit(1)
# Wait a moment for the application to fully load
import time
time.sleep(2)
# List all root windows
result = session.computer.list_root_windows(timeout_ms=5000)
if result.success:
windows = result.windows
print(f"Found {len(windows)} windows")
for window in windows:
print(f"Title: {window.title}")
print(f"Window ID: {window.window_id}")
print(f"Process: {window.pname if window.pname else 'N/A'}")
print(f"PID: {window.pid if window.pid else 'N/A'}")
print("---")
Getting Active Window
python
result = session.computer.get_active_window()
if result.success and result.window:
window = result.window
print(f"Active window: {window.title}")
print(f"Window ID: {window.window_id}")
print(f"Size: {window.width}x{window.height}")
else:
print("No active window found")
Window Control Operations
Activate Window
python
# Get available windows
windows_result = session.computer.list_root_windows()
if windows_result.success and windows_result.windows:
window_id = windows_result.windows[0].window_id
result = session.computer.activate_window(window_id)
if result.success:
print("Window activated successfully")
Maximize Window
python
windows_result = session.computer.list_root_windows()
if windows_result.success and windows_result.windows:
window_id = windows_result.windows[0].window_id
result = session.computer.maximize_window(window_id)
if result.success:
print("Window maximized successfully")
Minimize Window
python
windows_result = session.computer.list_root_windows()
if windows_result.success and windows_result.windows:
window_id = windows_result.windows[0].window_id
result = session.computer.minimize_window(window_id)
if result.success:
print("Window minimized successfully")
Restore Window
python
windows_result = session.computer.list_root_windows()
if windows_result.success and windows_result.windows:
window_id = windows_result.windows[0].window_id
result = session.computer.restore_window(window_id)
if result.success:
print("Window restored successfully")
Resize Window
python
windows_result = session.computer.list_root_windows()
if windows_result.success and windows_result.windows:
window_id = windows_result.windows[0].window_id
result = session.computer.resize_window(window_id, 800, 600)
if result.success:
print("Window resized to 800x600")
Fullscreen Window
python
windows_result = session.computer.list_root_windows()
if windows_result.success and windows_result.windows:
window_id = windows_result.windows[0].window_id
result = session.computer.fullscreen_window(window_id)
if result.success:
print("Window set to fullscreen")
Close Window
python
# Note: Use with caution as it permanently closes windows
windows_result = session.computer.list_root_windows()
if windows_result.success and windows_result.windows:
window_id = windows_result.windows[0].window_id
result = session.computer.close_window(window_id)
if result.success:
print("Window closed successfully")
Focus Mode
python
# Enable focus mode to reduce distractions
result = session.computer.focus_mode(on=True)
if result.success:
print("Focus mode enabled")
# Disable focus mode
result = session.computer.focus_mode(on=False)
if result.success:
print("Focus mode disabled")
Complete Window Management Workflow
python
import time
def manage_application_window(session, app_name: str, load_wait: float = 3.0, step_wait: float = 1.0):
    """Start an application, find its window and run a sequence of window operations.

    The window is activated, maximized, resized to 1024x768 and finally
    restored. The workflow stops at the first operation that reports failure.

    Args:
        session: Session object exposing the ``computer`` automation module.
        app_name: Command used to start the application. Lower-cased and with
            any ``.exe`` removed, it is also matched against window titles to
            locate the application's window.
        load_wait: Seconds to wait after launching before listing windows.
        step_wait: Seconds to pause between consecutive window operations.

    Returns:
        True when the application started, its window was found and every
        window operation succeeded; False otherwise.
    """
    import time

    computer = session.computer

    # Step 1: Start the application
    print(f"Starting {app_name}...")
    start_result = computer.start_app(app_name)
    if not start_result.success:
        print(f"Failed to start {app_name}")
        return False

    # Step 2: Wait for the application to load
    time.sleep(load_wait)

    # Step 3: Find the application window
    windows_result = computer.list_root_windows()
    if not windows_result.success:
        print("Failed to list windows")
        return False

    needle = app_name.lower().replace('.exe', '')
    target_window = next(
        (
            window
            for window in (windows_result.windows or [])
            if needle in window.title.lower()
        ),
        None,
    )
    if not target_window:
        print(f"Window for {app_name} not found")
        return False
    print(f"Found window: {target_window.title}")

    # Step 4: Perform window operations, checking each result instead of
    # reporting success unconditionally.
    window_id = target_window.window_id
    steps = [
        ("activated", lambda: computer.activate_window(window_id)),
        ("maximized", lambda: computer.maximize_window(window_id)),
        ("resized", lambda: computer.resize_window(window_id, 1024, 768)),
        ("restored", lambda: computer.restore_window(window_id)),
    ]
    for index, (label, operation) in enumerate(steps):
        if index:
            # Give the desktop time to settle between operations
            time.sleep(step_wait)
        result = operation()
        if not result.success:
            print(f"Window could not be {label}: {result.error_message}")
            return False
        print(f"Window {label}")
    return True
# Usage example - Get app from installed apps list
apps_result = session.computer.get_installed_apps()
if apps_result.success and apps_result.data:
# Find a suitable app (e.g., notepad or text editor)
# Match case-insensitively: the needle must be lower-case because it is compared with app.name.lower()
target_app = next((app for app in apps_result.data if "google chrome" in app.name.lower()), None)
if target_app:
success = manage_application_window(session, target_app.start_cmd)
if success:
print("Window management completed successfully")
else:
print("No suitable application found in installed apps")
else:
print("Failed to get installed applications")
Advanced Usage (20 minutes)
Automated Desktop Workflows
python
def automated_text_editing_workflow(session, app_keyword: str = "Google Chrome"):
    """Launch an installed application and drive it with typed text and shortcuts.

    The first installed app whose name contains ``app_keyword`` is started.
    Its window, if found, is activated and maximized. Sample text is then
    typed, selected, copied and pasted below a "Copied content:" line.

    Args:
        session: Session object exposing the ``computer`` automation module.
        app_keyword: Text looked for (case-insensitively) in installed app
            names and in window titles.

    Returns:
        True when the workflow ran to the end; False when the app list could
        not be read, no matching app was installed, or the app did not start.
    """
    import time

    computer = session.computer
    # Both sides are lower-cased; a mixed-case needle compared with a
    # lower-cased name can never match.
    needle = app_keyword.lower()

    # Get installed apps and find the target application
    apps_result = computer.get_installed_apps()
    if not apps_result.success:
        print("Failed to get installed applications")
        return False

    first_app = next(
        (app for app in apps_result.data if needle in app.name.lower()),
        None,
    )
    if not first_app:
        print("No suitable text editor found in installed apps")
        return False

    # Start the application using its registered start command
    start_result = computer.start_app(first_app.start_cmd)
    if not start_result.success:
        print(f"Failed to start {first_app.name}")
        return False
    time.sleep(2)

    # Find and activate the window; a failed listing is treated as "no windows"
    windows_result = computer.list_root_windows()
    windows = (windows_result.windows or []) if windows_result.success else []
    first_app_window = next(
        (window for window in windows if needle in window.title.lower()),
        None,
    )
    if first_app_window:
        # Check that the window has a valid window_id before operating on it
        if getattr(first_app_window, 'window_id', None):
            computer.activate_window(first_app_window.window_id)
            computer.maximize_window(first_app_window.window_id)
        else:
            print(f"Warning: Found window '{first_app_window.title}' but it has no valid window_id")

    # Type some text (typing proceeds even when no window was matched,
    # as in the original workflow)
    text_content = """Hello from AGB Computer Automation!
This text was automatically typed using the AGB SDK.
Features demonstrated:
1. Application launching
2. Window management
3. Text input
4. Keyboard shortcuts
"""
    computer.input_text(text_content)
    time.sleep(1)

    # Select all text
    computer.press_keys(["Ctrl", "a"])
    time.sleep(0.5)

    # Copy text
    computer.press_keys(["Ctrl", "c"])
    time.sleep(0.5)

    # Move to end and paste
    computer.press_keys(["Ctrl", "End"])
    computer.press_keys(["Enter", "Enter"])
    computer.input_text("Copied content:")
    computer.press_keys(["Enter"])
    computer.press_keys(["Ctrl", "v"])

    print("Automated text editing workflow completed")
    return True
# Run the workflow
automated_text_editing_workflow(session)
Best Practices
1. Session Management
python
# ✅ Good: Always clean up sessions
def safe_automation_task():
    """Create a session, perform one click and always delete the session afterwards.

    Returns:
        False when the session could not be created; otherwise the success
        flag of the click operation.
    """
    agb = AGB()
    params = CreateSessionParams(image_id="agb-computer-use-ubuntu-2204")
    result = agb.create(params)
    if not result.success:
        print(f"Failed to create session: {result.error_message}")
        return False

    session = result.session
    try:
        # Perform automation tasks
        result = session.computer.click_mouse(100, 100)
        return result.success
    finally:
        # Runs on both normal return and exception, so the session is never leaked
        agb.delete(session)
        print("Session cleaned up")
# Usage
success = safe_automation_task()
2. Error Handling and Retry Logic
python
def robust_operation(session, operation_func, max_retries=3, retry_delay=2):
    """Call ``operation_func`` until it reports success or the retries run out.

    Args:
        session: Not used by this helper; kept so existing callers that pass
            the session keep working.
        operation_func: Zero-argument callable returning a result object with
            ``success`` and ``error_message`` attributes.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The first successful result, or None when every attempt failed or
        raised (also when ``max_retries`` is zero or negative).
    """
    import time

    for attempt in range(1, max_retries + 1):
        try:
            result = operation_func()
            if result.success:
                return result
            print(f"Attempt {attempt} failed: {result.error_message}")
        except Exception as e:
            # Deliberately broad: this wrapper is the retry boundary and any
            # failure of the operation counts as one failed attempt.
            print(f"Attempt {attempt} exception: {e}")
        if attempt < max_retries:
            time.sleep(retry_delay)  # Wait before retry
    return None
# Usage example
def click_operation():
    """Click at (500, 300) using the enclosing script's ``session``; returns the click result."""
    return session.computer.click_mouse(500, 300)
result = robust_operation(session, click_operation)
if result:
print("Operation succeeded")
else:
print("Operation failed after all retries")
3. Coordinate Validation
python
def safe_click(session, x, y, button=MouseButton.LEFT):
    """Click at (x, y) only when the point lies inside the reported screen area.

    Args:
        session: Session object exposing the ``computer`` automation module.
        x: Horizontal coordinate; must satisfy ``0 <= x < width``.
        y: Vertical coordinate; must satisfy ``0 <= y < height``.
        button: Mouse button passed through to ``click_mouse``.

    Returns:
        The success flag of the click. False when the screen size could not
        be obtained or parsed, or when the coordinates are off-screen.
    """
    import json

    # Get screen size first
    screen_result = session.computer.get_screen_size()
    if not screen_result.success:
        print("Failed to get screen size")
        return False

    try:
        screen_data = json.loads(screen_result.data)
        max_x = screen_data['width']
        max_y = screen_data['height']
    except (TypeError, ValueError, KeyError) as exc:
        # A "safe" helper should report an unusable payload, not raise
        print(f"Invalid screen size data: {exc}")
        return False

    # Validate coordinates
    if not (0 <= x < max_x and 0 <= y < max_y):
        print(f"Invalid coordinates: ({x}, {y}). Screen size: {max_x}x{max_y}")
        return False

    # Perform click
    result = session.computer.click_mouse(x, y, button)
    return result.success
# Usage
success = safe_click(session, 500, 300)
4. Application State Management
python
def ensure_application_ready(session, app_name, window_title_contains, timeout=30, poll_interval=1.0):
    """Start an application and wait until a window with a matching title appears.

    Args:
        session: Session object exposing the ``computer`` automation module.
        app_name: Command passed to ``start_app``.
        window_title_contains: Text that must occur (case-insensitively) in
            the title of the window to wait for.
        timeout: Maximum number of seconds to wait, measured from before the
            application is started.
        poll_interval: Seconds to sleep between window listings.

    Returns:
        The matching window (already activated), or None when the application
        failed to start or no matching window appeared within ``timeout``.
    """
    import time

    # Monotonic clock: the deadline is unaffected by system clock adjustments
    deadline = time.monotonic() + timeout

    # Start application
    start_result = session.computer.start_app(app_name)
    if not start_result.success:
        return None

    needle = window_title_contains.lower()

    # Wait for window to appear
    while time.monotonic() < deadline:
        windows_result = session.computer.list_root_windows()
        if windows_result.success:
            for window in windows_result.windows or []:
                # Only windows whose title contains the requested text qualify
                if needle not in (window.title or "").lower():
                    continue
                if getattr(window, 'window_id', None):
                    session.computer.activate_window(window.window_id)
                    return window
                print(f"Warning: Found matching window '{window.title}' but it has no valid window_id")
        time.sleep(poll_interval)

    print(f"Timeout waiting for {app_name} window")
    return None
# Usage
notepad_window = ensure_application_ready(session, "notepad.exe", "notepad")
if notepad_window:
print(f"Notepad ready: {notepad_window.title}")
Troubleshooting
Common Issues
Coordinate Issues
python
# Check screen size before clicking
screen_result = session.computer.get_screen_size()
if screen_result.success:
import json
screen_data = json.loads(screen_result.data)
print(f"Screen dimensions: {screen_data['width']}x{screen_data['height']}")
print(f"DPI scaling: {screen_data['dpiScalingFactor']}")
Application Not Starting
python
# Check installed applications
apps_result = session.computer.get_installed_apps()
if apps_result.success:
app_names = [app.name for app in apps_result.data]
print("Available applications:", app_names)
# Check if target app exists
target_app = "notepad.exe"
matching_apps = [app for app in apps_result.data if target_app.lower() in app.name.lower()]
if matching_apps:
print(f"Found matching apps: {[app.name for app in matching_apps]}")
else:
print(f"No apps found matching '{target_app}'")
Window Not Found
python
# Debug window discovery
def debug_windows(session):
    """Print the title, id and process name of every root window in the session.

    Prints the error message instead when the window listing fails.
    """
    result = session.computer.list_root_windows()
    if not result.success:
        print(f"Failed to list windows: {result.error_message}")
        return

    print(f"Found {len(result.windows)} windows:")
    for position, window in enumerate(result.windows, start=1):
        print(f" {position}. Title: '{window.title}'")
        print(f" ID: {window.window_id}")
        print(f" Process: {window.pname}")
        print()
debug_windows(session)
Timing Issues
python
import time
# Add delays between operations
session.computer.start_app(start_cmd)
time.sleep(3) # Wait for app to start
session.computer.click_mouse(500, 300)
time.sleep(0.5) # Wait for click to register
session.computer.input_text("Hello")
time.sleep(1) # Wait for text input
Session Health Check
python
def check_session_health(session):
    """Probe the session with a screen-size query and report whether it responds.

    Returns:
        True when the query succeeds; False when it reports failure or raises.
    """
    try:
        # Try a simple operation
        result = session.computer.get_screen_size()
    except Exception as e:
        # Deliberately broad: any error while probing means the session is unusable
        print(f"Session error: {e}")
        return False

    if result.success:
        print("Session is healthy")
        return True

    print(f"Session issue: {result.error_message}")
    return False
# Usage
if not check_session_health(session):
print("Session needs to be recreated")
Related Documentation
- Session Management Guide - Managing automation sessions
- API Reference - Complete computer automation API