Reconnection and Error Recovery: Resilient Real-Time Apps
Network failures are inevitable: WiFi drops, proxies time out, servers restart. A good real-time application handles these gracefully, reconnecting automatically and recovering state. This article implements server-side session recovery and client-side resilience patterns, so that users do not lose context after a brief disconnection and messages sent while offline are delivered on reconnection.
Client-Side Exponential Backoff Reconnection
The browser client should attempt reconnection with exponential backoff: wait 1 second before the first retry, then 2 seconds, then 4 seconds, doubling each time and capping at a maximum (e.g., 60 seconds):
/**
 * WebSocket wrapper that reconnects with exponential backoff and buffers
 * outgoing messages while the connection is down.
 *
 * Messages passed to send() while not connected are queued in
 * `message_buffer` and written, in order, as soon as the socket opens.
 */
class ResilientWebSocket {
  /**
   * @param {string} url - WebSocket URL to connect to.
   * @param {object} [options]
   * @param {number} [options.max_retries=10] - Reconnect attempts before giving up.
   * @param {number} [options.initial_delay=1000] - First retry delay in ms.
   * @param {number} [options.max_delay=60000] - Upper bound for the retry delay in ms.
   */
  constructor(url, options = {}) {
    this.url = url;
    // An option that is present but undefined/null falls back to its default
    // instead of overriding it (which would make the backoff delay NaN).
    const pick = (value, fallback) =>
      (value === undefined || value === null ? fallback : value);
    this.options = {
      ...options,
      max_retries: pick(options.max_retries, 10),
      initial_delay: pick(options.initial_delay, 1000), // 1 second
      max_delay: pick(options.max_delay, 60000), // 60 seconds
    };
    this.ws = null;
    this.retry_count = 0;
    this.retry_timeout = null;
    this.message_buffer = []; // Messages sent while offline
    this.message_handlers = [];
    this.state = 'disconnected'; // disconnected, connecting, connected
    // False once close() has been called, so the resulting close event
    // does not schedule a reconnect.
    this.should_reconnect = true;
  }

  /**
   * Open the socket unless a connection is already open or in progress.
   * Calling connect() after close() re-enables automatic reconnection.
   */
  async connect() {
    if (this.state === 'connecting' || this.state === 'connected') {
      return;
    }
    this.state = 'connecting';
    this.should_reconnect = true;

    let socket;
    try {
      socket = new WebSocket(this.url);
    } catch (error) {
      console.error('Connection error:', error);
      // Reset the state first; otherwise the retry's connect() call would
      // see 'connecting' and return without doing anything.
      this.state = 'disconnected';
      this._schedule_reconnect();
      return;
    }
    this.ws = socket;

    socket.onopen = () => {
      if (this.ws !== socket) {
        return; // Event from a socket that has since been replaced.
      }
      console.log('WebSocket connected');
      this.state = 'connected';
      this.retry_count = 0;
      // Flush buffered messages
      this._flush_buffer();
    };

    socket.onmessage = (event) => {
      let payload;
      try {
        payload = JSON.parse(event.data);
      } catch (error) {
        // A malformed frame must not throw out of the event handler.
        console.error('Discarding malformed message:', error);
        return;
      }
      for (const handler of this.message_handlers) {
        handler(payload);
      }
    };

    socket.onerror = (error) => {
      console.error('WebSocket error:', error);
    };

    socket.onclose = () => {
      if (this.ws !== socket) {
        return; // A late close from a superseded socket must not reconnect.
      }
      console.log('WebSocket closed');
      this.state = 'disconnected';
      if (this.should_reconnect) {
        this._schedule_reconnect();
      }
    };
  }

  /** Schedule the next connect() attempt, or give up after max_retries. */
  _schedule_reconnect() {
    if (this.retry_count >= this.options.max_retries) {
      console.error('Max retries reached');
      return;
    }
    // Exponential backoff: delay = min(initial_delay * 2^retry_count, max_delay)
    const delay = Math.min(
      this.options.initial_delay * Math.pow(2, this.retry_count),
      this.options.max_delay
    );
    console.log(`Reconnecting in ${delay}ms (attempt ${this.retry_count + 1})`);
    this.retry_count += 1;
    if (this.retry_timeout) {
      clearTimeout(this.retry_timeout); // Never keep two timers pending.
    }
    this.retry_timeout = setTimeout(() => {
      this.retry_timeout = null;
      this.connect();
    }, delay);
  }

  /**
   * Send a message as JSON, or buffer it when the socket is not connected.
   * @param {*} message - Any JSON-serializable value.
   */
  send(message) {
    if (this.state === 'connected') {
      this.ws.send(JSON.stringify(message));
    } else {
      // Buffer message for later
      this.message_buffer.push(message);
      console.log(`Buffered message (${this.message_buffer.length} pending)`);
    }
  }

  /** Write every buffered message to the socket in the order it was queued. */
  _flush_buffer() {
    while (this.message_buffer.length > 0) {
      // Dequeue only after send() returns, so a throwing send() leaves the
      // message in the buffer for the next flush.
      this.ws.send(JSON.stringify(this.message_buffer[0]));
      this.message_buffer.shift();
    }
    console.log('Message buffer flushed');
  }

  /**
   * Register a callback invoked with each parsed incoming message.
   * @param {function(*): void} handler
   */
  on_message(handler) {
    this.message_handlers.push(handler);
  }

  /** Close the socket for good: cancel any pending retry and do not reconnect. */
  close() {
    this.should_reconnect = false;
    this.state = 'disconnected';
    if (this.retry_timeout) {
      clearTimeout(this.retry_timeout);
      this.retry_timeout = null;
    }
    if (this.ws) {
      this.ws.close();
    }
  }
}
// Usage in HTML
const ws = new ResilientWebSocket('ws://localhost:8000/ws?room=general&username=Alice&client_id=123');

// Render each incoming message as a new <div> appended to the message list.
ws.on_message((msg) => {
  const row = document.createElement('div');
  row.innerText = msg.text;
  document.getElementById('messages').appendChild(row);
});

// Send the input's current text as a chat message, then clear the input.
document.getElementById('send_btn').onclick = () => {
  const input = document.getElementById('msg');
  ws.send({
    type: 'chat_message',
    text: input.value,
    timestamp: new Date().toISOString(),
  });
  input.value = '';
};

ws.connect();
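The message_buffer array holds messages sent while offline. When the connection resumes, they are flushed to the server in order, so messages written during a brief network hiccup are not lost. Note that the buffer lives only in memory: it does not survive a page reload, and it cannot recover a message that was handed to a socket which had already died without the client noticing.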
Server-Side Session Persistence and Recovery
Clients might reconnect with the same client_id but a new WebSocket. The server should restore session state (room, username, presence) instead of treating them as a new user:
class SessionManager:
    """Track WebSocket sessions and let clients resume them after a short drop.

    A session that disconnects is parked in ``closed_sessions`` for
    ``grace_period`` seconds. If the same ``client_id`` connects again within
    that window, the stored username and room are restored and peers receive
    ``user_reconnected``; otherwise peers receive ``user_left``.
    """

    def __init__(self, grace_period: int = 30):
        self.active_sessions: Dict[str, dict] = {}  # client_id -> session
        self.closed_sessions: Dict[str, dict] = {}  # client_id -> closed session
        self.grace_period = grace_period  # seconds to recover session

    async def connect(self, client_id: str, username: str, room: str, websocket: WebSocket):
        """Accept *websocket* and register it as the session for *client_id*.

        When a closed session for *client_id* is still inside the grace
        period, its stored username and room are used instead of the
        *username* and *room* arguments.
        """
        closed_session = self.closed_sessions.get(client_id)
        if closed_session is not None:
            elapsed = (datetime.now() - closed_session["closed_at"]).total_seconds()
            if elapsed < self.grace_period:
                # Reconnection within grace period; restore session
                print(f"Restoring session for {client_id} after {elapsed:.1f}s")
                await websocket.accept()
                self.active_sessions[client_id] = {
                    "username": closed_session["username"],
                    "room": closed_session["room"],
                    "websocket": websocket,
                    "connected_at": datetime.now(),
                    "reconnected": True,
                }
                self.closed_sessions.pop(client_id, None)
                # Notify peers that user reconnected
                await self._notify_room(closed_session["room"], {
                    "type": "user_reconnected",
                    "username": closed_session["username"],
                })
                return
            # Grace period expired; treat as new user
            self.closed_sessions.pop(client_id, None)

        # New user
        await websocket.accept()
        self.active_sessions[client_id] = {
            "username": username,
            "room": room,
            "websocket": websocket,
            "connected_at": datetime.now(),
            "reconnected": False,
        }
        await self._notify_room(room, {
            "type": "user_joined",
            "username": username,
        })

    async def disconnect(self, client_id: str):
        """Park the session for *client_id* and expire it after the grace period.

        Peers are told ``user_away`` immediately. This coroutine then sleeps
        for ``grace_period`` seconds and, if the client has not come back,
        removes the parked session and tells peers ``user_left``. Does nothing
        when *client_id* has no active session.
        """
        session = self.active_sessions.pop(client_id, None)
        if session is None:
            return

        # Move to closed_sessions for grace period
        closed_entry = {
            "username": session["username"],
            "room": session["room"],
            "closed_at": datetime.now(),
            "last_activity": session.get("last_activity", datetime.now()),
        }
        self.closed_sessions[client_id] = closed_entry
        print(f"Session {client_id} closed; will recover if reconnected within {self.grace_period}s")

        # Notify peers (but don't say "left" yet; wait for grace period)
        await self._notify_room(session["room"], {
            "type": "user_away",
            "username": session["username"],
        })

        # Schedule cleanup after grace period
        await asyncio.sleep(self.grace_period)

        # Expire only the entry created by this call. If the client came back
        # and dropped again in the meantime, a newer entry with its own timer
        # sits under the same key and must keep its full grace period.
        if self.closed_sessions.get(client_id) is closed_entry:
            del self.closed_sessions[client_id]
            await self._notify_room(closed_entry["room"], {
                "type": "user_left",
                "username": closed_entry["username"],
            })

    async def _notify_room(self, room: str, message: dict):
        """Broadcast to all users in a room."""
        # Snapshot the recipients first: each send awaits, and sessions may be
        # added or removed meanwhile, which would break iteration over the
        # live dict.
        recipients = [
            (cid, session["websocket"])
            for cid, session in self.active_sessions.items()
            if session["room"] == room
        ]
        for cid, socket in recipients:
            try:
                await socket.send_json(message)
            except Exception as exc:
                # Best effort: one failing peer must not stop the broadcast.
                print(f"Failed to notify {cid}: {exc}")
manager = SessionManager(grace_period=30)


@app.websocket("/ws")
async def websocket_endpoint(
    websocket: WebSocket,
    room: str = Query(...),
    username: str = Query(...),
    client_id: str = Query(...),
):
    """Register the connection with ``manager`` and read messages until it fails.

    Any exception raised while receiving ends the loop and hands the session
    to ``manager.disconnect``.
    """
    await manager.connect(client_id, username, room, websocket)
    try:
        while True:
            incoming = await websocket.receive_text()
            # ... handle message ...
    except Exception:
        await manager.disconnect(client_id)
Now, if a client drops and reconnects within 30 seconds with the same client_id, their session is restored. Peers see user_away while they're gone, then user_reconnected when they return. If the grace period expires, peers see user_left.
Deduplication of Buffered Messages
If a buffered message is sent twice (e.g., client timeout logic sends it, then reconnect flushes it again), the server should deduplicate. Add a unique message ID:
class MessageDeduplicator:
    """Remember recently seen message IDs so repeated deliveries can be skipped.

    An ID counts as a duplicate only while its first sighting is younger than
    ``ttl_seconds``; after that it is treated as new again.
    """

    def __init__(self, ttl_seconds: int = 300):
        self.seen_ids: Dict[str, datetime] = {}  # message_id -> time first seen
        self.ttl = ttl_seconds

    def is_duplicate(self, message_id: str) -> bool:
        """Return True if *message_id* was already seen within the TTL.

        An unseen or expired ID is recorded with the current time and False
        is returned.
        """
        now = datetime.now()
        cutoff = now - timedelta(seconds=self.ttl)

        # Compare against the cutoff on lookup: an entry older than the TTL
        # may still be in the dict because pruning only happens on inserts.
        seen_at = self.seen_ids.get(message_id)
        if seen_at is not None and seen_at > cutoff:
            return True

        # Cleanup old IDs, then record this one.
        self.seen_ids = {mid: t for mid, t in self.seen_ids.items() if t > cutoff}
        self.seen_ids[message_id] = now
        return False
deduplicator = MessageDeduplicator()
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, ...):
await manager.connect(...)
try:
while True:
data = await websocket.receive_text()
msg = json.loads(data)
# Check for duplicate
msg_id = msg.get("id")
if msg_id and deduplicator.is_duplicate(msg_id):
# Already processed; skip
continue
# Process message
...
except Exception:
...
Clients should generate a unique ID for each message (UUID or timestamp-based). The server tracks these IDs for a few minutes and ignores duplicates.
Heartbeat and Liveness Checks
Send periodic pings to detect dead connections early:
async def heartbeat_loop(self, client_id: str, interval: float = 30):
    """Send a ``{"type": "ping"}`` JSON message to *client_id* periodically.

    This is an application-level message sent with ``send_json``, not a
    WebSocket protocol ping frame. The loop sleeps *interval* seconds between
    pings and ends when the client is no longer in ``active_sessions``. If
    sending fails, the session is passed to ``self.disconnect`` and the loop
    stops.

    Args:
        client_id: Key of the session in ``self.active_sessions``.
        interval: Seconds to wait before each ping (default 30).
    """
    while client_id in self.active_sessions:
        await asyncio.sleep(interval)
        # The session may have been removed while this coroutine slept.
        session = self.active_sessions.get(client_id)
        if session is None:
            break
        try:
            await session["websocket"].send_json({"type": "ping"})
        except Exception:
            await self.disconnect(client_id)
            break
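Note that the code above sends an application-level JSON message, not a WebSocket protocol ping frame. Browsers answer protocol-level ping frames automatically, but a JSON {"type": "ping"} message is delivered to the client's message handler like any other, so the client must reply explicitly (e.g., with {"type": "pong"}) and the server must record when the last pong arrived. If no pong is received after 2 pings, the client is unresponsive and the server can close the connection.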
Key Takeaways
- Exponential backoff prevents reconnection storms; start at 1 second, double each time, cap at 60 seconds.
- Message buffering ensures no messages are lost during brief disconnections; flush on reconnection.
- Server-side session recovery restores user context if they reconnect within a grace period (e.g., 30 seconds).
- Deduplication prevents duplicate message processing from buffered retries.
- Heartbeats detect dead connections early, avoiding silent failures.
- Distinguish between user_away (temporary) and user_left (permanent) states.
Frequently Asked Questions
How do I notify the user that messages are pending delivery?
Display a small badge or indicator: "3 messages pending...". Remove it once the buffer is flushed and the server acknowledges receipt. Use a buffered_messages counter in the client state.
What if the client loses internet for 10 minutes?
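With the defaults shown above, the client stops retrying well before 10 minutes have passed: ten attempts with delays of 1, 2, 4, 8, 16 and 32 seconds followed by four waits at the 60-second cap add up to roughly five minutes, after which it logs "Max retries reached" and gives up. To survive longer outages, raise max_retries (every further attempt waits the 60-second cap) or call connect() again when the browser reports that it is back online. On the server side, the grace period (30 seconds by default) has long expired, so peers have already seen user_left, and when the client does reconnect the server treats them as a new user. For apps requiring session recovery after prolonged outages, increase the grace period (but be aware of memory implications).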
Can I recover message history for the offline period?
Yes. Track the client's last-seen message ID. When they reconnect, send all messages since that ID: SELECT * FROM messages WHERE id > ? AND room = ?. This reconstructs the conversation and prevents the user from feeling like they missed context.
How do I handle clock skew in reconnection timestamps?
Use server time for grace period calculations, not client time. When the client disconnects, the server records the timestamp; when the client reconnects, the server compares the current time against that stored timestamp, avoiding issues with clock differences.