[PATCH v2] audit: improve audit queue handling when "audit=1" on cmdline

Fri Jan 21 20:34:22 UTC 2022

On 2022-01-20 11:47, Paul Moore wrote:
> When an admin enables audit at early boot via the "audit=1" kernel
> command line the audit queue behavior is slightly different; the
> audit subsystem goes to greater lengths to avoid dropping records,
> which unfortunately can result in problems when the audit daemon is
> forcibly stopped for an extended period of time.
> 
> This patch makes a number of changes designed to improve the audit
> queuing behavior so that leaving the audit daemon in a stopped state
> for an extended period does not cause a significant impact to the
> system.
> 
> - kauditd_send_queue() is now limited to looping through the
>   passed queue only once per call.  This not only prevents the
>   function from looping indefinitely when records are returned
>   to the current queue, it also allows any recovery handling in
>   kauditd_thread() to take place when kauditd_send_queue()
>   returns.
> 
> - Transient netlink send errors seen as -EAGAIN now cause the
>   record to be returned to the retry queue instead of going to
>   the hold queue.  The intention of the hold queue is to store,
>   perhaps for an extended period of time, the events which led
>   up to the audit daemon going offline.  The retry queue remains
>   a temporary queue intended to protect against transient issues
>   between the kernel and the audit daemon.
> 
> - The retry queue is now limited by the audit_backlog_limit
>   setting, the same as the other queues.  This allows admins
>   to bound the size of all of the audit queues on the system.
> 
> - kauditd_rehold_skb() now returns records to the end of the
>   hold queue to ensure ordering is preserved in the face of
>   recent changes to kauditd_send_queue().
> 
> Cc: stable at vger.kernel.org
> Fixes: 5b52330bbfe63 ("audit: fix auditd/kernel connection state tracking")
> Fixes: f4b3ee3c85551 ("audit: improve robustness of the audit queue handling")
> Reported-by: Gaosheng Cui <cuigaosheng1 at huawei.com>
> Signed-off-by: Paul Moore <paul at paul-moore.com>

Looks reasonable to me.
Reviewed-by: Richard Guy Briggs <rgb at redhat.com>

> --
> v2:
> - incorporated feedback from Gaosheng Cui
> - promoted to proper patch
> v1:
> - initial RFC
> ---
>  kernel/audit.c |   62 +++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 43 insertions(+), 19 deletions(-)
> 
> diff --git a/kernel/audit.c b/kernel/audit.c
> index 4cebadb5f30d..60f8a77a6c83 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb)
>  /**
>   * kauditd_rehold_skb - Handle a audit record send failure in the hold queue
>   * @skb: audit record
> + * @error: error code (unused)
>   *
>   * Description:
>   * This should only be used by the kauditd_thread when it fails to flush the
>   * hold queue.
>   */
> -static void kauditd_rehold_skb(struct sk_buff *skb)
> +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
>  {
> -	/* put the record back in the queue at the same place */
> -	skb_queue_head(&audit_hold_queue, skb);
> +	/* put the record back in the queue */
> +	skb_queue_tail(&audit_hold_queue, skb);
>  }
>  
>  /**
>   * kauditd_hold_skb - Queue an audit record, waiting for auditd
>   * @skb: audit record
> + * @error: error code
>   *
>   * Description:
>   * Queue the audit record, waiting for an instance of auditd.  When this
> @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
>   * and queue it, if we have room.  If we want to hold on to the record, but we
>   * don't have room, record a record lost message.
>   */
> -static void kauditd_hold_skb(struct sk_buff *skb)
> +static void kauditd_hold_skb(struct sk_buff *skb, int error)
>  {
>  	/* at this point it is uncertain if we will ever send this to auditd so
>  	 * try to send the message via printk before we go any further */
>  	kauditd_printk_skb(skb);
>  
>  	/* can we just silently drop the message? */
> -	if (!audit_default) {
> -		kfree_skb(skb);
> -		return;
> +	if (!audit_default)
> +		goto drop;
> +
> +	/* the hold queue is only for when the daemon goes away completely,
> +	 * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
> +	 * record on the retry queue unless it's full, in which case drop it
> +	 */
> +	if (error == -EAGAIN) {
> +		if (!audit_backlog_limit ||
> +		    skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
> +			skb_queue_tail(&audit_retry_queue, skb);
> +			return;
> +		}
> +		audit_log_lost("kauditd retry queue overflow");
> +		goto drop;
>  	}
>  
> -	/* if we have room, queue the message */
> +	/* if we have room in the hold queue, queue the message */
>  	if (!audit_backlog_limit ||
>  	    skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
>  		skb_queue_tail(&audit_hold_queue, skb);
> @@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb)
>  
>  	/* we have no other options - drop the message */
>  	audit_log_lost("kauditd hold queue overflow");
> +drop:
>  	kfree_skb(skb);
>  }
>  
>  /**
>   * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
>   * @skb: audit record
> + * @error: error code (unused)
>   *
>   * Description:
>   * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
>   * but for some reason we are having problems sending it audit records so
>   * queue the given record and attempt to resend.
>   */
> -static void kauditd_retry_skb(struct sk_buff *skb)
> +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
>  {
> -	/* NOTE: because records should only live in the retry queue for a
> -	 * short period of time, before either being sent or moved to the hold
> -	 * queue, we don't currently enforce a limit on this queue */
> -	skb_queue_tail(&audit_retry_queue, skb);
> +	if (!audit_backlog_limit ||
> +	    skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
> +		skb_queue_tail(&audit_retry_queue, skb);
> +		return;
> +	}
> +
> +	/* we have to drop the record, send it via printk as a last effort */
> +	kauditd_printk_skb(skb);
> +	audit_log_lost("kauditd retry queue overflow");
> +	kfree_skb(skb);
>  }
>  
>  /**
> @@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac)
>  	/* flush the retry queue to the hold queue, but don't touch the main
>  	 * queue since we need to process that normally for multicast */
>  	while ((skb = skb_dequeue(&audit_retry_queue)))
> -		kauditd_hold_skb(skb);
> +		kauditd_hold_skb(skb, -ECONNREFUSED);
>  }
>  
>  /**
> @@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
>  			      struct sk_buff_head *queue,
>  			      unsigned int retry_limit,
>  			      void (*skb_hook)(struct sk_buff *skb),
> -			      void (*err_hook)(struct sk_buff *skb))
> +			      void (*err_hook)(struct sk_buff *skb, int error))
>  {
>  	int rc = 0;
> -	struct sk_buff *skb;
> +	struct sk_buff *skb = NULL;
> +	struct sk_buff *skb_tail;
>  	unsigned int failed = 0;
>  
>  	/* NOTE: kauditd_thread takes care of all our locking, we just use
>  	 *       the netlink info passed to us (e.g. sk and portid) */
>  
> -	while ((skb = skb_dequeue(queue))) {
> +	skb_tail = skb_peek_tail(queue);
> +	while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
>  		/* call the skb_hook for each skb we touch */
>  		if (skb_hook)
>  			(*skb_hook)(skb);
> @@ -731,7 +755,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
>  		/* can we send to anyone via unicast? */
>  		if (!sk) {
>  			if (err_hook)
> -				(*err_hook)(skb);
> +				(*err_hook)(skb, -ECONNREFUSED);
>  			continue;
>  		}
>  
> @@ -745,7 +769,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
>  			    rc == -ECONNREFUSED || rc == -EPERM) {
>  				sk = NULL;
>  				if (err_hook)
> -					(*err_hook)(skb);
> +					(*err_hook)(skb, rc);
>  				if (rc == -EAGAIN)
>  					rc = 0;
>  				/* continue to drain the queue */
> 
> --
> Linux-audit mailing list
> Linux-audit at redhat.com
> https://listman.redhat.com/mailman/listinfo/linux-audit
> 

- RGB

--
Richard Guy Briggs <rgb at redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635