wireguard-go/device/send.go

547 lines
14 KiB
Go
Raw Normal View History

2019-01-02 00:55:51 +00:00
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2020 WireGuard LLC. All Rights Reserved.
*/
2019-03-03 03:04:41 +00:00
package device
2017-06-26 11:14:02 +00:00
import (
"bytes"
"encoding/binary"
2017-06-26 11:14:02 +00:00
"net"
"sync"
"sync/atomic"
"time"
2019-05-14 07:09:52 +00:00
"golang.org/x/crypto/chacha20poly1305"
"golang.org/x/net/ipv4"
"golang.org/x/net/ipv6"
2017-06-26 11:14:02 +00:00
)
2017-12-01 22:37:26 +00:00
/* Outbound flow
2017-06-26 11:14:02 +00:00
*
* 1. TUN queue
* 2. Routing (sequential)
* 3. Nonce assignment (sequential)
* 4. Encryption (parallel)
* 5. Transmission (sequential)
2017-06-26 11:14:02 +00:00
*
2017-12-01 22:37:26 +00:00
* The functions in this file occur (roughly) in the order in
* which the packets are processed.
*
* Locking, Producers and Consumers
*
* The order of packets (per peer) must be maintained,
* but encryption of packets happen out-of-order:
*
* The sequential consumers will attempt to take the lock,
* workers release lock when they have completed work (encryption) on the packet.
*
* If the element is inserted into the "encryption queue",
2017-12-01 22:37:26 +00:00
* the content is preceded by enough "junk" to contain the transport header
2017-07-07 11:47:09 +00:00
* (to allow the construction of transport messages in-place)
*/
2017-12-01 22:37:26 +00:00
type QueueOutboundElement struct {
sync.Mutex
buffer *[MaxMessageSize]byte // slice holding the packet data
packet []byte // slice of "buffer" (always!)
nonce uint64 // nonce for encryption
2018-05-13 17:50:58 +00:00
keypair *Keypair // keypair for encryption
peer *Peer // related peer
2017-06-26 11:14:02 +00:00
}
func (device *Device) NewOutboundElement() *QueueOutboundElement {
2018-09-22 04:29:02 +00:00
elem := device.GetOutboundElement()
elem.buffer = device.GetMessageBuffer()
elem.Mutex = sync.Mutex{}
2018-09-22 04:29:02 +00:00
elem.nonce = 0
// keypair and peer were cleared (if necessary) by clearPointers.
return elem
}
// clearPointers clears elem fields that contain pointers.
// This makes the garbage collector's life easier and
// avoids accidentally keeping other objects around unnecessarily.
// It also reduces the possible collateral damage from use-after-free bugs.
func (elem *QueueOutboundElement) clearPointers() {
elem.buffer = nil
elem.packet = nil
2018-09-22 04:29:02 +00:00
elem.keypair = nil
elem.peer = nil
}
func addToNonceQueue(queue chan *QueueOutboundElement, elem *QueueOutboundElement, device *Device) {
for {
select {
case queue <- elem:
return
default:
select {
case old := <-queue:
2018-09-16 22:43:23 +00:00
device.PutMessageBuffer(old.buffer)
2018-09-22 04:29:02 +00:00
device.PutOutboundElement(old)
default:
}
}
}
}
2017-06-26 11:14:02 +00:00
/* Queues a keepalive if no packets are queued for peer
*/
func (peer *Peer) SendKeepalive() bool {
peer.queue.RLock()
defer peer.queue.RUnlock()
if len(peer.queue.nonce) != 0 || peer.queue.packetInNonceQueueIsAwaitingKey.Get() || !peer.isRunning.Get() {
return false
}
elem := peer.device.NewOutboundElement()
elem.packet = nil
select {
case peer.queue.nonce <- elem:
peer.device.log.Debug.Println(peer, "- Sending keepalive packet")
return true
default:
peer.device.PutMessageBuffer(elem.buffer)
2018-09-22 04:29:02 +00:00
peer.device.PutOutboundElement(elem)
return false
}
}
func (peer *Peer) SendHandshakeInitiation(isRetry bool) error {
if !isRetry {
2018-05-20 04:50:07 +00:00
atomic.StoreUint32(&peer.timers.handshakeAttempts, 0)
}
2018-05-13 21:14:43 +00:00
peer.handshake.mutex.RLock()
if time.Since(peer.handshake.lastSentHandshake) < RekeyTimeout {
2018-05-13 21:14:43 +00:00
peer.handshake.mutex.RUnlock()
return nil
}
peer.handshake.mutex.RUnlock()
peer.handshake.mutex.Lock()
if time.Since(peer.handshake.lastSentHandshake) < RekeyTimeout {
2018-05-13 21:14:43 +00:00
peer.handshake.mutex.Unlock()
return nil
}
2018-05-13 21:14:43 +00:00
peer.handshake.lastSentHandshake = time.Now()
peer.handshake.mutex.Unlock()
peer.device.log.Debug.Println(peer, "- Sending handshake initiation")
msg, err := peer.device.CreateMessageInitiation(peer)
if err != nil {
peer.device.log.Error.Println(peer, "- Failed to create initiation message:", err)
return err
}
var buff [MessageInitiationSize]byte
writer := bytes.NewBuffer(buff[:0])
binary.Write(writer, binary.LittleEndian, msg)
packet := writer.Bytes()
2018-05-13 21:14:43 +00:00
peer.cookieGenerator.AddMacs(packet)
peer.timersAnyAuthenticatedPacketTraversal()
peer.timersAnyAuthenticatedPacketSent()
2018-05-13 21:14:43 +00:00
err = peer.SendBuffer(packet)
if err != nil {
peer.device.log.Error.Println(peer, "- Failed to send handshake initiation:", err)
2018-05-13 21:14:43 +00:00
}
peer.timersHandshakeInitiated()
2018-05-13 21:14:43 +00:00
return err
}
func (peer *Peer) SendHandshakeResponse() error {
peer.handshake.mutex.Lock()
peer.handshake.lastSentHandshake = time.Now()
peer.handshake.mutex.Unlock()
peer.device.log.Debug.Println(peer, "- Sending handshake response")
2018-05-13 21:14:43 +00:00
response, err := peer.device.CreateMessageResponse(peer)
if err != nil {
peer.device.log.Error.Println(peer, "- Failed to create response message:", err)
2018-05-13 21:14:43 +00:00
return err
}
var buff [MessageResponseSize]byte
writer := bytes.NewBuffer(buff[:0])
binary.Write(writer, binary.LittleEndian, response)
packet := writer.Bytes()
peer.cookieGenerator.AddMacs(packet)
err = peer.BeginSymmetricSession()
if err != nil {
peer.device.log.Error.Println(peer, "- Failed to derive keypair:", err)
2018-05-13 21:14:43 +00:00
return err
}
peer.timersSessionDerived()
peer.timersAnyAuthenticatedPacketTraversal()
peer.timersAnyAuthenticatedPacketSent()
2018-05-13 21:14:43 +00:00
err = peer.SendBuffer(packet)
if err != nil {
peer.device.log.Error.Println(peer, "- Failed to send handshake response:", err)
2018-05-13 21:14:43 +00:00
}
return err
}
func (device *Device) SendHandshakeCookie(initiatingElem *QueueHandshakeElement) error {
2018-12-18 23:35:53 +00:00
device.log.Debug.Println("Sending cookie response for denied handshake message for", initiatingElem.endpoint.DstToString())
2018-05-13 21:14:43 +00:00
sender := binary.LittleEndian.Uint32(initiatingElem.packet[4:8])
reply, err := device.cookieChecker.CreateReply(initiatingElem.packet, sender, initiatingElem.endpoint.DstToBytes())
if err != nil {
device.log.Error.Println("Failed to create cookie reply:", err)
return err
}
var buff [MessageCookieReplySize]byte
writer := bytes.NewBuffer(buff[:0])
binary.Write(writer, binary.LittleEndian, reply)
device.net.bind.Send(writer.Bytes(), initiatingElem.endpoint)
return nil
}
func (peer *Peer) keepKeyFreshSending() {
2018-05-13 21:14:43 +00:00
keypair := peer.keypairs.Current()
if keypair == nil {
return
}
2018-05-13 21:14:43 +00:00
nonce := atomic.LoadUint64(&keypair.sendNonce)
if nonce > RekeyAfterMessages || (keypair.isInitiator && time.Since(keypair.created) > RekeyAfterTime) {
peer.SendHandshakeInitiation(false)
}
}
/* Reads packets from the TUN and inserts
* into nonce queue for peer
*
* Obs. Single instance per TUN device
*/
2017-08-04 14:15:53 +00:00
func (device *Device) RoutineReadFromTUN() {
logDebug := device.log.Debug
logError := device.log.Error
defer func() {
logDebug.Println("Routine: TUN reader - stopped")
device.state.stopping.Done()
}()
logDebug.Println("Routine: TUN reader - started")
2018-09-22 04:29:02 +00:00
var elem *QueueOutboundElement
for {
2018-09-22 04:29:02 +00:00
if elem != nil {
device.PutMessageBuffer(elem.buffer)
device.PutOutboundElement(elem)
}
elem = device.NewOutboundElement()
2017-06-26 11:14:02 +00:00
2017-08-25 12:53:23 +00:00
// read packet
offset := MessageTransportHeaderSize
size, err := device.tun.device.Read(elem.buffer[:], offset)
if err != nil {
if !device.isClosed.Get() {
logError.Println("Failed to read packet from TUN device:", err)
device.Close()
}
device.PutMessageBuffer(elem.buffer)
2018-09-22 04:29:02 +00:00
device.PutOutboundElement(elem)
return
}
2017-08-25 12:53:23 +00:00
if size == 0 || size > MaxContentSize {
continue
}
2017-06-26 11:14:02 +00:00
elem.packet = elem.buffer[offset : offset+size]
2017-08-04 14:15:53 +00:00
// lookup peer
2017-06-26 11:14:02 +00:00
var peer *Peer
switch elem.packet[0] >> 4 {
case ipv4.Version:
2017-08-04 14:15:53 +00:00
if len(elem.packet) < ipv4.HeaderLen {
continue
}
dst := elem.packet[IPv4offsetDst : IPv4offsetDst+net.IPv4len]
2018-05-13 21:14:43 +00:00
peer = device.allowedips.LookupIPv4(dst)
2017-06-26 11:14:02 +00:00
case ipv6.Version:
2017-08-04 14:15:53 +00:00
if len(elem.packet) < ipv6.HeaderLen {
continue
}
dst := elem.packet[IPv6offsetDst : IPv6offsetDst+net.IPv6len]
2018-05-13 21:14:43 +00:00
peer = device.allowedips.LookupIPv6(dst)
2017-06-26 11:14:02 +00:00
default:
2017-12-01 22:37:26 +00:00
logDebug.Println("Received packet with unknown IP version")
}
if peer == nil {
continue
}
// insert into nonce/pre-handshake queue
peer.queue.RLock()
if peer.isRunning.Get() {
if peer.queue.packetInNonceQueueIsAwaitingKey.Get() {
peer.SendHandshakeInitiation(false)
}
addToNonceQueue(peer.queue.nonce, elem, device)
2018-09-22 04:29:02 +00:00
elem = nil
}
peer.queue.RUnlock()
2017-06-26 11:14:02 +00:00
}
}
func (peer *Peer) FlushNonceQueue() {
select {
case peer.signals.flushNonceQueue <- struct{}{}:
default:
}
}
/* Queues packets when there is no handshake.
* Then assigns nonces to packets sequentially
* and creates "work" structs for workers
2017-06-26 11:14:02 +00:00
*
* Obs. A single instance per peer
2017-06-26 11:14:02 +00:00
*/
func (peer *Peer) RoutineNonce() {
2018-05-13 16:23:40 +00:00
var keypair *Keypair
device := peer.device
2017-07-07 11:47:09 +00:00
logDebug := device.log.Debug
2018-02-04 18:18:44 +00:00
flush := func() {
for {
select {
case elem := <-peer.queue.nonce:
device.PutMessageBuffer(elem.buffer)
2018-09-22 04:29:02 +00:00
device.PutOutboundElement(elem)
default:
return
}
}
}
2018-09-23 23:52:02 +00:00
defer func() {
flush()
logDebug.Println(peer, "- Routine: nonce worker - stopped")
peer.queue.packetInNonceQueueIsAwaitingKey.Set(false)
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
device.queue.encryption.wg.Done() // no more writes from us
close(peer.queue.outbound) // no more writes to this channel
2018-09-23 23:52:02 +00:00
peer.routines.stopping.Done()
}()
logDebug.Println(peer, "- Routine: nonce worker - started")
NextPacket:
for {
peer.queue.packetInNonceQueueIsAwaitingKey.Set(false)
select {
case <-peer.routines.stop:
return
case <-peer.signals.flushNonceQueue:
flush()
continue NextPacket
case elem, ok := <-peer.queue.nonce:
if !ok {
return
}
2017-06-26 11:14:02 +00:00
// make sure to always pick the newest key
for {
// check validity of newest key pair
2018-05-13 16:23:40 +00:00
keypair = peer.keypairs.Current()
if keypair != nil && atomic.LoadUint64(&keypair.sendNonce) < RejectAfterMessages {
if time.Since(keypair.created) < RejectAfterTime {
break
}
}
peer.queue.packetInNonceQueueIsAwaitingKey.Set(true)
// no suitable key pair, request for new handshake
select {
case <-peer.signals.newKeypairArrived:
default:
}
peer.SendHandshakeInitiation(false)
2017-11-30 22:22:40 +00:00
// wait for key to be established
logDebug.Println(peer, "- Awaiting keypair")
select {
case <-peer.signals.newKeypairArrived:
logDebug.Println(peer, "- Obtained awaited keypair")
case <-peer.signals.flushNonceQueue:
device.PutMessageBuffer(elem.buffer)
2018-09-22 04:29:02 +00:00
device.PutOutboundElement(elem)
flush()
continue NextPacket
case <-peer.routines.stop:
device.PutMessageBuffer(elem.buffer)
2018-09-22 04:29:02 +00:00
device.PutOutboundElement(elem)
return
}
}
peer.queue.packetInNonceQueueIsAwaitingKey.Set(false)
2017-06-26 11:14:02 +00:00
// populate work element
2017-06-26 11:14:02 +00:00
elem.peer = peer
2018-05-13 16:23:40 +00:00
elem.nonce = atomic.AddUint64(&keypair.sendNonce, 1) - 1
// double check in case of race condition added by future code
if elem.nonce >= RejectAfterMessages {
atomic.StoreUint64(&keypair.sendNonce, RejectAfterMessages)
device.PutMessageBuffer(elem.buffer)
2018-09-22 04:29:02 +00:00
device.PutOutboundElement(elem)
continue NextPacket
}
2018-05-13 16:23:40 +00:00
elem.keypair = keypair
elem.Lock()
// add to parallel and sequential queue
peer.queue.outbound <- elem
device.queue.encryption.c <- elem
2017-06-26 11:14:02 +00:00
}
}
2017-06-26 11:14:02 +00:00
}
func calculatePaddingSize(packetSize, mtu int) int {
lastUnit := packetSize
if mtu == 0 {
return ((lastUnit + PaddingMultiple - 1) & ^(PaddingMultiple - 1)) - lastUnit
}
if lastUnit > mtu {
lastUnit %= mtu
}
paddedSize := ((lastUnit + PaddingMultiple - 1) & ^(PaddingMultiple - 1))
if paddedSize > mtu {
paddedSize = mtu
}
return paddedSize - lastUnit
}
/* Encrypts the elements in the queue
* and marks them for sequential consumption (by releasing the mutex)
*
* Obs. One instance per core
*/
func (device *Device) RoutineEncryption() {
2017-07-17 14:16:18 +00:00
var nonce [chacha20poly1305.NonceSize]byte
2017-07-17 14:16:18 +00:00
logDebug := device.log.Debug
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
defer logDebug.Println("Routine: encryption worker - stopped")
logDebug.Println("Routine: encryption worker - started")
2017-07-17 14:16:18 +00:00
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
for elem := range device.queue.encryption.c {
// populate header fields
header := elem.buffer[:MessageTransportHeaderSize]
2017-06-26 11:14:02 +00:00
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
fieldType := header[0:4]
fieldReceiver := header[4:8]
fieldNonce := header[8:16]
2017-07-02 13:28:38 +00:00
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
binary.LittleEndian.PutUint32(fieldType, MessageTransportType)
binary.LittleEndian.PutUint32(fieldReceiver, elem.keypair.remoteIndex)
binary.LittleEndian.PutUint64(fieldNonce, elem.nonce)
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
// pad content to multiple of 16
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
paddingSize := calculatePaddingSize(len(elem.packet), int(atomic.LoadInt32(&device.tun.mtu)))
for i := 0; i < paddingSize; i++ {
elem.packet = append(elem.packet, 0)
}
2017-07-02 13:28:38 +00:00
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
// encrypt content and release to consumer
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-14 23:07:23 +00:00
binary.LittleEndian.PutUint64(nonce[4:], elem.nonce)
elem.packet = elem.keypair.send.Seal(
header,
nonce[:],
elem.packet,
nil,
)
elem.Unlock()
}
}
/* Sequentially reads packets from queue and sends to endpoint
*
* Obs. Single instance per peer.
* The routine terminates then the outbound queue is closed.
*/
func (peer *Peer) RoutineSequentialSender() {
device := peer.device
2017-07-08 21:51:26 +00:00
logDebug := device.log.Debug
logError := device.log.Error
2017-07-08 21:51:26 +00:00
defer logDebug.Println(peer, "- Routine: sequential sender - stopped")
logDebug.Println(peer, "- Routine: sequential sender - started")
for elem := range peer.queue.outbound {
elem.Lock()
if !peer.isRunning.Get() {
// peer has been stopped; return re-usable elems to the shared pool.
// This is an optimization only. It is possible for the peer to be stopped
// immediately after this check, in which case, elem will get processed.
// The timers and SendBuffer code are resilient to a few stragglers.
// TODO(josharian): rework peer shutdown order to ensure
// that we never accidentally keep timers alive longer than necessary.
2017-07-27 21:45:37 +00:00
device.PutMessageBuffer(elem.buffer)
2018-09-22 04:29:02 +00:00
device.PutOutboundElement(elem)
continue
}
peer.timersAnyAuthenticatedPacketTraversal()
peer.timersAnyAuthenticatedPacketSent()
// send message and return buffer to pool
err := peer.SendBuffer(elem.packet)
if len(elem.packet) != MessageKeepaliveSize {
peer.timersDataSent()
}
device.PutMessageBuffer(elem.buffer)
device.PutOutboundElement(elem)
if err != nil {
logError.Println(peer, "- Failed to send data packet", err)
continue
}
peer.keepKeyFreshSending()
2017-06-26 11:14:02 +00:00
}
}