Kea 3.2.0-git
pkt_filter_lpf.cc
Go to the documentation of this file.
1// Copyright (C) 2013-2026 Internet Systems Consortium, Inc. ("ISC")
2//
3// This Source Code Form is subject to the terms of the Mozilla Public
4// License, v. 2.0. If a copy of the MPL was not distributed with this
5// file, You can obtain one at http://mozilla.org/MPL/2.0/.
6
7#include <config.h>
8#include <dhcp/dhcp4.h>
9#include <dhcp/iface_mgr.h>
10#include <dhcp/pkt4.h>
11#include <dhcp/pkt_filter_lpf.h>
12#include <dhcp/protocol_util.h>
14#include <fcntl.h>
15#include <net/ethernet.h>
16#include <linux/filter.h>
17#include <linux/if_ether.h>
18#include <linux/if_packet.h>
19
20namespace {
21
22using namespace isc::dhcp;
23
// Socket filter program that openSocket() attaches to the raw socket with
// SO_ATTACH_FILTER. It accepts only non-fragmented IP/UDP packets addressed
// to the broadcast address or to the configured unicast address, sent to
// the configured UDP port and carrying no VLAN tag; every failed test jumps
// to the final instruction (#15), which drops the packet.
// The operands of instructions #8 (unicast address) and #11 (UDP port) are
// placeholders that openSocket() overwrites by array index, and all jumps
// are relative, so inserting, removing or reordering instructions requires
// updating both the jump offsets and those indices.
46struct sock_filter dhcp_sock_filter [] = {
47 // Make sure this is an IP packet: check the half-word (two bytes)
48 // at offset 12 in the packet (the Ethernet packet type). If it
49 // is, advance to the next instruction. If not, advance 13
50 // instructions (which takes execution to the last instruction in
51 // the sequence: "drop it").
52 // #0
53 BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ETHERNET_PACKET_TYPE_OFFSET),
54 // #1
55 BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_IP, 0, 13),
56
57 // Make sure it's a UDP packet. The IP protocol is at offset
58 // 9 in the IP header so, adding the Ethernet packet header size
59 // of 14 bytes gives an absolute byte offset in the packet of 23.
60 // #2
61 BPF_STMT(BPF_LD + BPF_B + BPF_ABS,
62 ETHERNET_HEADER_LEN + IP_PROTO_TYPE_OFFSET),
63 // #3: not UDP -> skip 11 instructions to #15 (drop).
64 BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 11),
65
66 // Make sure this isn't a fragment by checking that the fragment
67 // offset field in the IP header is zero. This field is the
68 // least-significant 13 bits in the bytes at offsets 6 and 7 in
69 // the IP header, so the half-word at offset 20 (6 + size of
70 // Ethernet header) is loaded and an appropriate mask applied.
71 // #4
72 BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ETHERNET_HEADER_LEN + IP_FLAGS_OFFSET),
73 // #5: any of the masked bits set -> skip 9 instructions to #15 (drop).
74 BPF_JUMP(BPF_JMP + BPF_JSET + BPF_K, 0x1fff, 9, 0),
75
76 // Check the packet's destination address. The program will only
77 // allow the packets sent to the broadcast address or unicast
78 // to the specific address on the interface. By default, this
79 // address is set to 0 and must be set to the specific value
80 // when the raw socket is created and the program is attached
81 // to it. The caller must assign the address to the
82 // prog.bf_insns[8].k in the network byte order.
// NOTE(review): openSocket() in this file stores addr.toUint32() in
// dhcp_sock_filter[8].k without any conversion; confirm that the byte
// order produced by that call is the one this comparison expects.
83 // #6
84 BPF_STMT(BPF_LD + BPF_W + BPF_ABS,
85 ETHERNET_HEADER_LEN + IP_DEST_ADDR_OFFSET),
86 // If this is a broadcast address, skip the next check.
87 // #7: on a match jump over #8 straight to #9.
88 BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0xffffffff, 1, 0),
89 // If this is not broadcast address, compare it with the unicast
90 // address specified for the interface.
91 // #8: operand is a placeholder; mismatch -> skip 6 instructions to #15.
92 BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0x00000000, 0, 6),
93
94 // Get the IP header length. This is achieved by the following
95 // (special) instruction that, given the offset of the start
96 // of the IP header (offset 14) loads the IP header length.
97 // #9
98 BPF_STMT(BPF_LDX + BPF_B + BPF_MSH, ETHERNET_HEADER_LEN),
99
100 // Make sure it's to the right port. The following instruction
101 // adds the previously extracted IP header length to the given
102 // offset to locate the correct byte. The given offset of 16
103 // comprises the length of the Ethernet header (14) plus the offset
104 // of the UDP destination port (2) within the UDP header.
105 // #10
106 BPF_STMT(BPF_LD + BPF_H + BPF_IND, ETHERNET_HEADER_LEN + UDP_DEST_PORT),
107 // The following instruction tests against the default DHCP server port,
108 // but the actual port is set in PktFilterLPF::openSocket().
109 // N.B. The code in that method assumes that this instruction is at
110 // offset 11 in the program. If this is changed, openSocket() must be
111 // updated.
112 // #11: wrong port -> skip 3 instructions to #15 (drop).
113 BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DHCP4_SERVER_PORT, 0, 3),
114
115 // Make sure this packet does not contain a vlan tag. The tag is stripped
116 // automatically by the kernel when presented to the vlan interface.
117 // Parent interface should not see this packet.
118 // #12
119 BPF_STMT(BPF_LD + BPF_B + BPF_ABS, (u_int)SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT),
120
121 // If this packet contains vlan tag drop the packet.
122 // #13: loaded value 0 falls through to #14; otherwise skip to #15.
123 BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0, 0, 1),
124
125 // If we passed all the tests, ask for the whole packet.
126 // #14
127 BPF_STMT(BPF_RET + BPF_K, (u_int)-1),
128
129 // Otherwise, drop it.
130 // #15
131 BPF_STMT(BPF_RET + BPF_K, 0),
132};
133
134}
135
136using namespace isc::util;
137
138namespace isc {
139namespace dhcp {
140
141bool
143#ifdef SO_TIMESTAMP
144 return (true);
145#else
146 return (false);
147#endif
148}
149
152 const isc::asiolink::IOAddress& addr,
153 const uint16_t port, const bool,
154 const bool) {
155 // Open fallback socket first. If it fails, it will give us an indication
156 // that there is another service (perhaps DHCP server) running.
157 // The function will throw an exception and effectively cease opening
158 // raw socket below.
159 int fallback = openFallbackSocket(addr, port);
160
161 // The fallback is open, so we are good to open primary socket.
162 int sock = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
163 if (sock < 0) {
164 close(fallback);
165 isc_throw(SocketConfigError, "Failed to create raw LPF socket");
166 }
167
168 // Set the close-on-exec flag.
169 if (fcntl(sock, F_SETFD, FD_CLOEXEC) < 0) {
170 close(sock);
171 close(fallback);
172 isc_throw(SocketConfigError, "Failed to set close-on-exec flag"
173 << " on the socket " << sock);
174 }
175
176 int enable = 1;
177 // Enable ancillary data to detect VLAN tagged packets.
178 if (setsockopt(sock, SOL_PACKET, PACKET_AUXDATA, &enable, sizeof(enable))) {
179 const char* errmsg = strerror(errno);
180 isc_throw(SocketConfigError, "Could not enable PACKET_AUXDATA for " << addr.toText()
181 << ", error: " << errmsg);
182 }
183
184#ifdef SO_TIMESTAMP
185 if (setsockopt(sock, SOL_SOCKET, SO_TIMESTAMP, &enable, sizeof(enable))) {
186 const char* errmsg = strerror(errno);
187 isc_throw(SocketConfigError, "Could not enable SO_TIMESTAMP for " << addr.toText()
188 << ", error: " << errmsg);
189 }
190#endif
191
192 struct sockaddr_ll sa;
193 memset(&sa, 0, sizeof(sockaddr_ll));
194 sa.sll_family = AF_PACKET;
195 sa.sll_ifindex = iface.getIndex();
196
197 // For raw sockets we construct IP headers on our own, so we don't bind
198 // socket to IP address but to the interface. We will later use the
199 // Linux Packet Filtering to filter out these packets that we are
200 // interested in.
201 if (bind(sock, reinterpret_cast<const struct sockaddr*>(&sa),
202 sizeof(sa)) < 0) {
203 close(sock);
204 close(fallback);
205 isc_throw(SocketConfigError, "Failed to bind LPF socket '" << sock
206 << "' to interface '" << iface.getName() << "'");
207 }
208
209 // Set socket to non-blocking mode.
210 if (fcntl(sock, F_SETFL, O_NONBLOCK) != 0) {
211 // Get the error message immediately after the bind because the
212 // invocation to close() below would override the errno.
213 char* errmsg = strerror(errno);
214 close(sock);
215 close(fallback);
216 isc_throw(SocketConfigError, "failed to set SO_NONBLOCK option on the"
217 " LPF socket '" << sock << "' to interface '"
218 << iface.getName() << "', reason: " << errmsg);
219 }
220
221 struct sock_filter zero_filter[] = { BPF_STMT(BPF_RET + BPF_K, 0) };
222 struct sock_fprog zero_filter_program;
223 memset(&zero_filter_program, 0, sizeof(zero_filter_program));
224
225 zero_filter_program.filter = zero_filter;
226 zero_filter_program.len = sizeof(zero_filter) / sizeof(struct sock_filter);
227
228 // Apply the filter.
229 if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &zero_filter_program,
230 sizeof(zero_filter_program)) < 0) {
231 close(sock);
232 close(fallback);
233 isc_throw(SocketConfigError, "Failed to install zero packet filtering program"
234 << " on the socket " << sock);
235 }
236
237 int datalen;
238 uint8_t data;
239 // Non-DHCP packets may have been received before the filter was attached,
240 // so drain the socket.
241 do {
242 datalen = recv(sock, &data, sizeof(data), 0);
243 } while (datalen > 0);
244
245 // Create socket filter program. This program will only allow incoming UDP
246 // traffic which arrives on the specific (DHCP) port). It will also filter
247 // out all fragmented packets.
248 struct sock_fprog filter_program;
249 memset(&filter_program, 0, sizeof(filter_program));
250
251 filter_program.filter = dhcp_sock_filter;
252 filter_program.len = sizeof(dhcp_sock_filter) / sizeof(struct sock_filter);
253
254 // Configure the filter program to receive unicast packets sent to the
255 // specified address. The program will also allow packets sent to the
256 // 255.255.255.255 broadcast address.
257 dhcp_sock_filter[8].k = addr.toUint32();
258
259 // Override the default port value.
260 dhcp_sock_filter[11].k = port;
261
262 // Apply the filter.
263 if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &filter_program,
264 sizeof(filter_program)) < 0) {
265 close(sock);
266 close(fallback);
267 isc_throw(SocketConfigError, "Failed to install packet filtering program"
268 << " on the socket " << sock);
269 }
270
271 return (SocketInfo(addr, port, sock, fallback));
272}
273
275PktFilterLPF::receive(Iface& iface, const SocketInfo& socket_info) {
276 uint8_t raw_buf[IfaceMgr::RCVBUFSIZE];
277 // First let's get some data from the fallback socket. The data will be
278 // discarded but we don't want the socket buffer to bloat. We get the
279 // packets from the socket in loop but most of the time the loop will
280 // end after receiving one packet. The call to recv returns immediately
281 // when there is no data left on the socket because the socket is
282 // non-blocking.
283 // @todo In the normal conditions, both the primary socket and the fallback
284 // socket are in sync as they are set to receive packets on the same
285 // address and port. The reception of packets on the fallback socket
286 // shouldn't cause significant lags in packet reception. If we find in the
287 // future that it does, the sort of threshold could be set for the maximum
288 // bytes received on the fallback socket in a single round. Further
289 // optimizations would include an asynchronous read from the fallback socket
290 // when the DHCP server is idle.
291 int datalen;
292 do {
293 datalen = recv(socket_info.fallbackfd_, raw_buf, sizeof(raw_buf), 0);
294 } while (datalen > 0);
295
296#ifndef SO_TIMESTAMP
297 // Now that we finished getting data from the fallback socket, we
298 // have to get the data from the raw socket too.
299 int data_len = read(socket_info.sockfd_, raw_buf, sizeof(raw_buf));
300 // If negative value is returned by read(), it indicates that an
301 // error occurred. If returned value is 0, no data was read from the
302 // socket. In both cases something has gone wrong, because we expect
303 // that a chunk of data is there. We signal the lack of data by
304 // returning an empty packet.
305 if (data_len <= 0) {
306 return Pkt4Ptr();
307 }
308
309 InputBuffer buf(raw_buf, data_len);
310#else
311 const size_t CONTROL_BUF_LEN = 512;
312 uint8_t msg_buf[IfaceMgr::RCVBUFSIZE];
313 uint8_t control_buf[CONTROL_BUF_LEN];
314
315 memset(&control_buf[0], 0, CONTROL_BUF_LEN);
316
317 // Initialize our message header structure.
318 struct msghdr m;
319 memset(&m, 0, sizeof(m));
320
321 struct iovec v;
322 v.iov_base = static_cast<void*>(msg_buf);
323 v.iov_len = IfaceMgr::RCVBUFSIZE;
324 m.msg_iov = &v;
325 m.msg_iovlen = 1;
326
327 // Getting the interface is a bit more involved.
328 //
329 // We set up some space for a "control message". We have
330 // previously asked the kernel to give us packet
331 // information (when we initialized the interface), so we
332 // should get the destination address from that.
333 m.msg_control = &control_buf[0];
334 m.msg_controllen = CONTROL_BUF_LEN;
335
336 int result = recvmsg(socket_info.sockfd_, &m, 0);
337 if (result < 0) {
338 isc_throw(SocketReadError, "Pkt4FilterLpf to receive UDP4 data");
339 }
340
341 InputBuffer buf(msg_buf, result);
342#endif
343
344 // @todo: This is awkward way to solve the chicken and egg problem
345 // whereby we don't know the offset where DHCP data start in the
346 // received buffer when we create the packet object. In general case,
347 // the IP header has variable length. The information about its length
348 // is stored in one of its fields. Therefore, we have to decode the
349 // packet to get the offset of the DHCP data. The dummy object is
350 // created so as we can pass it to the functions which decode IP stack
351 // and find actual offset of the DHCP data.
352 // Once we find the offset we can create another Pkt4 object from
353 // the reminder of the input buffer and set the IP addresses and
354 // ports from the dummy packet. We should consider doing it
355 // in some more elegant way.
356 Pkt4Ptr dummy_pkt = Pkt4Ptr(new Pkt4(DHCPDISCOVER, 0));
357
358 // Decode ethernet, ip and udp headers.
359 decodeEthernetHeader(buf, dummy_pkt);
360 decodeIpUdpHeader(buf, dummy_pkt);
361
362 auto v4_len = buf.getLength() - buf.getPosition();
363 if (v4_len <= 0) {
364 isc_throw(SocketReadError, "Pkt4FilterLpf packet has no DHCPv4 data");
365 }
366
367 // Read the DHCP data.
368 std::vector<uint8_t> dhcp_buf;
369 buf.readVector(dhcp_buf, v4_len);
370
371 // Decode DHCP data into the Pkt4 object.
372 Pkt4Ptr pkt = Pkt4Ptr(new Pkt4(&dhcp_buf[0], dhcp_buf.size()));
373
374 // Set the appropriate packet members using data collected from
375 // the decoded headers.
376 pkt->setIndex(iface.getIndex());
377 pkt->setIface(iface.getName());
378 pkt->setLocalAddr(dummy_pkt->getLocalAddr());
379 pkt->setRemoteAddr(dummy_pkt->getRemoteAddr());
380 pkt->setLocalPort(dummy_pkt->getLocalPort());
381 pkt->setRemotePort(dummy_pkt->getRemotePort());
382 pkt->setLocalHWAddr(dummy_pkt->getLocalHWAddr());
383 pkt->setRemoteHWAddr(dummy_pkt->getRemoteHWAddr());
384
385#ifdef SO_TIMESTAMP
386 struct cmsghdr* cmsg = CMSG_FIRSTHDR(&m);
387 while (cmsg != NULL) {
388 if ((cmsg->cmsg_level == SOL_SOCKET) &&
389 (cmsg->cmsg_type == SCM_TIMESTAMP)) {
390
391 struct timeval cmsg_time;
392 memcpy(&cmsg_time, CMSG_DATA(cmsg), sizeof(cmsg_time));
393 pkt->addPktEvent(PktEvent::SOCKET_RECEIVED, cmsg_time);
394 break;
395 }
396
397 cmsg = CMSG_NXTHDR(&m, cmsg);
398 }
399#endif
400
401 // Set time packet was read from the buffer.
402 pkt->addPktEvent(PktEvent::BUFFER_READ);
403
404 return (pkt);
405}
406
407int
408PktFilterLPF::send(const Iface& iface, uint16_t sockfd, const Pkt4Ptr& pkt) {
409
410 OutputBuffer buf(14);
411
412 // Some interfaces may have no HW address - e.g. loopback interface.
413 // For these interfaces the HW address length is 0. If this is the case,
414 // then we will rely on the functions which construct the IP/UDP headers
415 // to provide a default HW addres. Otherwise, create the HW address
416 // object using the HW address of the interface.
417 if (iface.getMacLen() > 0) {
418 HWAddrPtr hwaddr(new HWAddr(iface.getMac(), iface.getMacLen(),
419 iface.getHWType()));
420 pkt->setLocalHWAddr(hwaddr);
421 }
422
423
424 // Ethernet frame header.
425 // Note that we don't validate whether HW addresses in 'pkt'
426 // are valid because they are checked by the function called.
427 writeEthernetHeader(pkt, buf);
428
429 // IP and UDP header
430 writeIpUdpHeader(pkt, buf);
431
432 // DHCPv4 message
433 buf.writeData(pkt->getBuffer().getData(), pkt->getBuffer().getLength());
434
435 sockaddr_ll sa;
436 memset(&sa, 0x0, sizeof(sa));
437 sa.sll_family = AF_PACKET;
438 sa.sll_ifindex = iface.getIndex();
439 sa.sll_protocol = htons(ETH_P_IP);
440 sa.sll_halen = 6;
441
442 pkt->addPktEvent(PktEvent::RESPONSE_SENT);
443 int result = sendto(sockfd, buf.getData(), buf.getLength(), 0,
444 reinterpret_cast<const struct sockaddr*>(&sa),
445 sizeof(sockaddr_ll));
446 if (result < 0) {
447 isc_throw(SocketWriteError, "failed to send DHCPv4 packet, errno="
448 << errno << " (check errno.h)");
449 }
450
451 return (0);
452
453}
454
455} // end of isc::dhcp namespace
456} // end of isc namespace
static const uint32_t RCVBUFSIZE
Packet reception buffer size.
Definition iface_mgr.h:807
Represents a single network interface.
Definition iface_mgr.h:136
size_t getMacLen() const
Returns MAC length.
Definition iface_mgr.h:233
std::string getName() const
Returns interface name.
Definition iface_mgr.h:264
uint16_t getHWType() const
Returns hardware type of the interface.
Definition iface_mgr.h:278
unsigned int getIndex() const
Returns interface index.
Definition iface_mgr.h:257
const uint8_t * getMac() const
Returns pointer to MAC address.
Definition iface_mgr.h:241
Represents DHCPv4 packet.
Definition pkt4.h:37
static const std::string BUFFER_READ
Event that marks when a packet is read from the socket buffer by application.
Definition pkt.h:97
static const std::string SOCKET_RECEIVED
Event that marks when a packet is placed in the socket buffer by the kernel.
Definition pkt.h:93
static const std::string RESPONSE_SENT
Event that marks when a packet has been written to the socket by the application.
Definition pkt.h:101
virtual bool isSocketReceivedTimeSupported() const
Check if the socket received time is supported.
virtual int send(const Iface &iface, uint16_t sockfd, const Pkt4Ptr &pkt)
Send packet over specified socket.
virtual SocketInfo openSocket(Iface &iface, const isc::asiolink::IOAddress &addr, const uint16_t port, const bool receive_bcast, const bool send_bcast)
Open primary and fallback socket.
virtual Pkt4Ptr receive(Iface &iface, const SocketInfo &socket_info)
Receive packet over specified socket.
virtual int openFallbackSocket(const isc::asiolink::IOAddress &addr, const uint16_t port)
Default implementation to open a fallback socket.
Definition pkt_filter.cc:18
IfaceMgr exception thrown when socket opening or configuration failed.
Definition iface_mgr.h:66
IfaceMgr exception thrown when an error occurred during reading data from socket.
Definition iface_mgr.h:74
IfaceMgr exception thrown when an error occurred during sending data through socket.
Definition iface_mgr.h:82
The InputBuffer class is a buffer abstraction for manipulating read-only data.
Definition buffer.h:81
void readVector(std::vector< uint8_t > &data, size_t len)
Read specified number of bytes as a vector.
Definition buffer.h:262
size_t getPosition() const
Return the current read position.
Definition buffer.h:101
size_t getLength() const
Return the length of the data stored in the buffer.
Definition buffer.h:96
The OutputBuffer class is a buffer abstraction for manipulating mutable data.
Definition buffer.h:346
void writeData(const void *data, size_t len)
Copy an arbitrary length of data into the buffer.
Definition buffer.h:559
const uint8_t * getData() const
Return a pointer to the head of the data stored in the buffer.
Definition buffer.h:398
size_t getLength() const
Return the length of data written in the buffer.
Definition buffer.h:412
#define isc_throw(type, stream)
A shortcut macro to insert known values into exception arguments.
boost::shared_ptr< Pkt4 > Pkt4Ptr
A pointer to Pkt4 object.
Definition pkt4.h:556
void decodeEthernetHeader(InputBuffer &buf, Pkt4Ptr &pkt)
Decode the Ethernet header.
void writeEthernetHeader(const Pkt4Ptr &pkt, OutputBuffer &out_buf)
Writes ethernet frame header into a buffer.
void decodeIpUdpHeader(InputBuffer &buf, Pkt4Ptr &pkt)
Decode IP and UDP header.
boost::shared_ptr< HWAddr > HWAddrPtr
Shared pointer to a hardware address structure.
Definition hwaddr.h:154
@ DHCPDISCOVER
Definition dhcp4.h:235
void writeIpUdpHeader(const Pkt4Ptr &pkt, util::OutputBuffer &out_buf)
Writes both IP and UDP header into output buffer.
Defines the logger used by the top-level component of kea-lfc.
Hardware type that represents information from DHCPv4 packet.
Definition hwaddr.h:20
Holds information about socket.
Definition socket_info.h:18
int sockfd_
Socket descriptor (a.k.a. primary socket).
Definition socket_info.h:30
int fallbackfd_
Fallback socket descriptor.
Definition socket_info.h:54