File: //proc/self/root/usr/sbin/cachestat-bpfcc
#! /usr/bin/python3
#
# cachestat     Count cache kernel function calls.
#               For Linux, uses BCC, eBPF. See .c file.
#
# USAGE: cachestat
# Taken from funccount by Brendan Gregg
# This is a rewrite of cachestat from perf to bcc
# https://github.com/brendangregg/perf-tools/blob/master/fs/cachestat
#
# Copyright (c) 2016 Allan McAleavy.
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 09-Sep-2015   Brendan Gregg   Created this.
# 06-Nov-2015   Allan McAleavy
# 13-Jan-2016   Allan McAleavy  run pep8 against program
# 02-Feb-2019   Brendan Gregg   Column shuffle, bring back %ratio
# 15-Feb-2023   Rong Tao        Add writeback_dirty_{folio,page} tracepoints
from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
import signal
import re
from sys import argv
# signal handler
def signal_ignore(signal, frame):
    print()
# Function to gather data from /proc/meminfo
# return dictionary for quicker lookup of both values
def get_meminfo():
    result = dict()
    for line in open('/proc/meminfo'):
        k = line.split(':', 3)
        v = k[1].split()
        result[k[0]] = int(v[0])
    return result
# set global variables
mpa = 0
mbd = 0
apcl = 0
apd = 0
total = 0
misses = 0
hits = 0
debug = 0
# arguments
parser = argparse.ArgumentParser(
    description="Count cache kernel function calls",
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("interval", nargs="?", default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=-1,
    help="number of outputs")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
count = int(args.count)
tstamp = args.timestamp
interval = int(args.interval)
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
struct key_t {
    // NF_{APCL,MPA,MBD,APD}
    u32 nf;
};
enum {
    NF_APCL,
    NF_MPA,
    NF_MBD,
    NF_APD,
};
BPF_HASH(counts, struct key_t);
static int __do_count(void *ctx, u32 nf) {
    struct key_t key = {};
    u64 ip;
    key.nf = nf;
    counts.atomic_increment(key); // update counter
    return 0;
}
int do_count_apcl(struct pt_regs *ctx) {
    return __do_count(ctx, NF_APCL);
}
int do_count_mpa(struct pt_regs *ctx) {
    return __do_count(ctx, NF_MPA);
}
int do_count_mbd(struct pt_regs *ctx) {
    return __do_count(ctx, NF_MBD);
}
int do_count_apd(struct pt_regs *ctx) {
    return __do_count(ctx, NF_APD);
}
int do_count_apd_tp(void *ctx) {
    return __do_count(ctx, NF_APD);
}
"""
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()
# load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event="add_to_page_cache_lru", fn_name="do_count_apcl")
b.attach_kprobe(event="mark_page_accessed", fn_name="do_count_mpa")
# Function account_page_dirtied() is changed to folio_account_dirtied() in 5.15.
# Both folio_account_dirtied() and account_page_dirtied() are
# static functions and they may be gone during compilation and this may
# introduce some inaccuracy, use tracepoint writeback_dirty_{page,folio},
# instead when attaching kprobe fails, and report the running
# error in time.
if BPF.get_kprobe_functions(b'folio_account_dirtied'):
    b.attach_kprobe(event="folio_account_dirtied", fn_name="do_count_apd")
elif BPF.get_kprobe_functions(b'account_page_dirtied'):
    b.attach_kprobe(event="account_page_dirtied", fn_name="do_count_apd")
elif BPF.tracepoint_exists("writeback", "writeback_dirty_folio"):
    b.attach_tracepoint(tp="writeback:writeback_dirty_folio", fn_name="do_count_apd_tp")
elif BPF.tracepoint_exists("writeback", "writeback_dirty_page"):
    b.attach_tracepoint(tp="writeback:writeback_dirty_page", fn_name="do_count_apd_tp")
else:
    raise Exception("Failed to attach kprobe %s or %s or any tracepoint" %
                    ("folio_account_dirtied", "account_page_dirtied"))
b.attach_kprobe(event="mark_buffer_dirty", fn_name="do_count_mbd")
# check whether hash table batch ops is supported
htab_batch_ops = True if BPF.kernel_struct_has_field(b'bpf_map_ops',
        b'map_lookup_and_delete_batch') == 1 else False
# header
if tstamp:
    print("%-8s " % "TIME", end="")
print("%8s %8s %8s %8s %12s %10s" %
     ("HITS", "MISSES", "DIRTIES", "HITRATIO", "BUFFERS_MB", "CACHED_MB"))
loop = 0
exiting = 0
while 1:
    if count > 0:
        loop += 1
        if loop > count:
            exit()
    try:
        sleep(interval)
    except KeyboardInterrupt:
        exiting = 1
        # as cleanup can take many seconds, trap Ctrl-C:
        signal.signal(signal.SIGINT, signal_ignore)
    counts = b["counts"]
    for k, v in sorted(counts.items_lookup_and_delete_batch()
                                if htab_batch_ops else counts.items(),
                                key=lambda counts: counts[1].value):
        # partial string matches in case of .isra (necessary?)
        if k.nf == 0: # NF_APCL
            apcl = max(0, v.value)
        if k.nf == 1: # NF_MPA
            mpa = max(0, v.value)
        if k.nf == 2: # NF_MBD
            mbd = max(0, v.value)
        if k.nf == 3: # NF_APD
            apd = max(0, v.value)
    # total = total cache accesses without counting dirties
    # misses = total of add to lru because of read misses
    total = mpa - mbd
    misses = apcl - apd
    if misses < 0:
        misses = 0
    if total < 0:
        total = 0
    hits = total - misses
    # If hits are < 0, then its possible misses are overestimated
    # due to possibly page cache read ahead adding more pages than
    # needed. In this case just assume misses as total and reset hits.
    if hits < 0:
        misses = total
        hits = 0
    ratio = 0
    if total > 0:
        ratio = float(hits) / total
    if debug:
        print("%d %d %d %d %d %d %d\n" %
        (mpa, mbd, apcl, apd, total, misses, hits))
    if not htab_batch_ops:
        counts.clear()
    # Get memory info
    mem = get_meminfo()
    cached = int(mem["Cached"]) / 1024
    buff = int(mem["Buffers"]) / 1024
    if tstamp:
        print("%-8s " % strftime("%H:%M:%S"), end="")
    print("%8d %8d %8d %7.2f%% %12.0f %10.0f" %
        (hits, misses, mbd, 100 * ratio, buff, cached))
    mpa = mbd = apcl = apd = total = misses = hits = cached = buff = 0
    if exiting:
        print("Detaching...")
        exit()