On 2/19/2025 11:49 AM, Alexey Kardashevskiy wrote: > > > On 19/2/25 12:20, Chenyi Qiang wrote: >> >> >> On 2/18/2025 5:19 PM, Alexey Kardashevskiy wrote: >>> >>> >> >> [..] >> >>>> diff --git a/include/system/memory-attribute-manager.h b/include/ >>>> system/memory-attribute-manager.h >>>> new file mode 100644 >>>> index 0000000000..72adc0028e >>>> --- /dev/null >>>> +++ b/include/system/memory-attribute-manager.h >>>> @@ -0,0 +1,42 @@ >>>> +/* >>>> + * QEMU memory attribute manager >>>> + * >>>> + * Copyright Intel >>>> + * >>>> + * Author: >>>> + * Chenyi Qiang <chenyi.qiang@xxxxxxxxx> >>>> + * >>>> + * This work is licensed under the terms of the GNU GPL, version 2 or >>>> later. >>>> + * See the COPYING file in the top-level directory >>>> + * >>>> + */ >>>> + >>>> +#ifndef SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H >>>> +#define SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H >>>> + >>>> +#include "system/hostmem.h" >>>> + >>>> +#define TYPE_MEMORY_ATTRIBUTE_MANAGER "memory-attribute-manager" >>>> + >>>> +OBJECT_DECLARE_TYPE(MemoryAttributeManager, >>>> MemoryAttributeManagerClass, MEMORY_ATTRIBUTE_MANAGER) >>>> + >>>> +struct MemoryAttributeManager { >>>> + Object parent; >>>> + >>>> + MemoryRegion *mr; >>>> + >>>> + /* 1-setting of the bit represents the memory is populated >>>> (shared) */ >>>> + int32_t bitmap_size; >>> >>> unsigned. >>> >>> Also, do either s/bitmap_size/shared_bitmap_size/ or >>> s/shared_bitmap/bitmap/ >> >> Will change it. Thanks. >> >>> >>> >>> >>>> + unsigned long *shared_bitmap; >>>> + >>>> + QLIST_HEAD(, RamDiscardListener) rdl_list; >>>> +}; >>>> + >>>> +struct MemoryAttributeManagerClass { >>>> + ObjectClass parent_class; >>>> +}; >>>> + >>>> +int memory_attribute_manager_realize(MemoryAttributeManager *mgr, >>>> MemoryRegion *mr); >>>> +void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr); >>>> + >>>> +#endif >>>> diff --git a/system/memory-attribute-manager.c b/system/memory- >>>> attribute-manager.c >>>> new file mode 100644 >>>> index 0000000000..ed97e43dd0 >>>> --- /dev/null >>>> +++ b/system/memory-attribute-manager.c >>>> @@ -0,0 +1,292 @@ >>>> +/* >>>> + * QEMU memory attribute manager >>>> + * >>>> + * Copyright Intel >>>> + * >>>> + * Author: >>>> + * Chenyi Qiang <chenyi.qiang@xxxxxxxxx> >>>> + * >>>> + * This work is licensed under the terms of the GNU GPL, version 2 or >>>> later. >>>> + * See the COPYING file in the top-level directory >>>> + * >>>> + */ >>>> + >>>> +#include "qemu/osdep.h" >>>> +#include "qemu/error-report.h" >>>> +#include "system/memory-attribute-manager.h" >>>> + >>>> +OBJECT_DEFINE_TYPE_WITH_INTERFACES(MemoryAttributeManager, >>>> + memory_attribute_manager, >>>> + MEMORY_ATTRIBUTE_MANAGER, >>>> + OBJECT, >>>> + { TYPE_RAM_DISCARD_MANAGER }, >>>> + { }) >>>> + >>>> +static int memory_attribute_manager_get_block_size(const >>>> MemoryAttributeManager *mgr) >>>> +{ >>>> + /* >>>> + * Because page conversion could be manipulated in the size of at >>>> least 4K or 4K aligned, >>>> + * Use the host page size as the granularity to track the memory >>>> attribute. >>>> + * TODO: if necessary, switch to get the page_size from RAMBlock. >>>> + * i.e. mgr->mr->ram_block->page_size. >>> >>> I'd assume it is rather necessary already. >> >> OK, Will return the page_size of RAMBlock directly. >> >>> >>>> + */ >>>> + return qemu_real_host_page_size(); >>>> +} >>>> + >>>> + >>>> +static bool memory_attribute_rdm_is_populated(const RamDiscardManager >>>> *rdm, >>>> + const >>>> MemoryRegionSection *section) >>>> +{ >>>> + const MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm); >>>> + int block_size = memory_attribute_manager_get_block_size(mgr); >>>> + uint64_t first_bit = section->offset_within_region / block_size; >>>> + uint64_t last_bit = first_bit + int128_get64(section->size) / >>>> block_size - 1; >>>> + unsigned long first_discard_bit; >>>> + >>>> + first_discard_bit = find_next_zero_bit(mgr->shared_bitmap, >>>> last_bit + 1, first_bit); >>>> + return first_discard_bit > last_bit; >>>> +} >>>> + >>>> +typedef int (*memory_attribute_section_cb)(MemoryRegionSection *s, >>>> void *arg); >>>> + >>>> +static int memory_attribute_notify_populate_cb(MemoryRegionSection >>>> *section, void *arg) >>>> +{ >>>> + RamDiscardListener *rdl = arg; >>>> + >>>> + return rdl->notify_populate(rdl, section); >>>> +} >>>> + >>>> +static int memory_attribute_notify_discard_cb(MemoryRegionSection >>>> *section, void *arg) >>>> +{ >>>> + RamDiscardListener *rdl = arg; >>>> + >>>> + rdl->notify_discard(rdl, section); >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static int memory_attribute_for_each_populated_section(const >>>> MemoryAttributeManager *mgr, >>>> + >>>> MemoryRegionSection *section, >>>> + void *arg, >>>> + >>>> memory_attribute_section_cb cb) >>>> +{ >>>> + unsigned long first_one_bit, last_one_bit; >>>> + uint64_t offset, size; >>>> + int block_size = memory_attribute_manager_get_block_size(mgr); >>>> + int ret = 0; >>>> + >>>> + first_one_bit = section->offset_within_region / block_size; >>>> + first_one_bit = find_next_bit(mgr->shared_bitmap, mgr- >>>>> bitmap_size, first_one_bit); >>>> + >>>> + while (first_one_bit < mgr->bitmap_size) { >>>> + MemoryRegionSection tmp = *section; >>>> + >>>> + offset = first_one_bit * block_size; >>>> + last_one_bit = find_next_zero_bit(mgr->shared_bitmap, mgr- >>>>> bitmap_size, >>>> + first_one_bit + 1) - 1; >>>> + size = (last_one_bit - first_one_bit + 1) * block_size; >>> >>> >>> What all this math is for if we stuck with VFIO doing 1 page at the >>> time? (I think I commented on this) >> >> Sorry, I missed your previous comment. IMHO, as we track the status in >> bitmap and we want to call the cb() on the shared part within >> MemoryRegionSection. Here we do the calculation to find the expected >> sub-range. > > > You find a largest intersection here and call cb() on it which will call > VFIO with 1 page at the time. So you could just call cb() for every page > from here which will make the code simpler. I prefer to keep calling cb() on a large intersection . I think in future after cut_mapping is supported, we don't need to make VFIO call 1 page at a time. VFIO can call on the large range directly. In addition, calling cb() for every page seems specific to VFIO usage. It is more generic to call on a large intersection. If more RDM listener added in future(although VFIO is the only user currently), do the split in caller is inefficient. > > >>> >>>> + >>>> + if (!memory_region_section_intersect_range(&tmp, offset, >>>> size)) { >>>> + break; >>>> + } >>>> + >>>> + ret = cb(&tmp, arg); >>>> + if (ret) { >>>> + error_report("%s: Failed to notify RAM discard listener: >>>> %s", __func__, >>>> + strerror(-ret)); >>>> + break; >>>> + } >>>> + >>>> + first_one_bit = find_next_bit(mgr->shared_bitmap, mgr- >>>>> bitmap_size, >>>> + last_one_bit + 2); >>>> + } >>>> + >>>> + return ret; >>>> +} >>>> + >> >> [..] >> >>>> + >>>> +static void >>>> memory_attribute_rdm_unregister_listener(RamDiscardManager *rdm, >>>> + >>>> RamDiscardListener *rdl) >>>> +{ >>>> + MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm); >>>> + int ret; >>>> + >>>> + g_assert(rdl->section); >>>> + g_assert(rdl->section->mr == mgr->mr); >>>> + >>>> + ret = memory_attribute_for_each_populated_section(mgr, rdl- >>>>> section, rdl, >>>> + >>>> memory_attribute_notify_discard_cb); >>>> + if (ret) { >>>> + error_report("%s: Failed to unregister RAM discard listener: >>>> %s", __func__, >>>> + strerror(-ret)); >>>> + } >>>> + >>>> + memory_region_section_free_copy(rdl->section); >>>> + rdl->section = NULL; >>>> + QLIST_REMOVE(rdl, next); >>>> + >>>> +} >>>> + >>>> +typedef struct MemoryAttributeReplayData { >>>> + void *fn; >>> >>> ReplayRamDiscard *fn, not void*. >> >> We could cast the void *fn either to ReplayRamPopulate or >> ReplayRamDiscard (see below). > > > Hard to read, hard to maintain, and they take same parameters, only the > return value is different (int/void) - if this is really important, have > 2 fn pointers in MemoryAttributeReplayData. It is already hard to follow > this train on callbacks. Actually, I prefer to make ReplayRamDiscard and ReplayRamPopulate unified. Make ReplayRamDiscard() also return int. Then we only need to define one function like: typedef int (*ReplayMemoryAttributeChange)(MemoryRegionSection *section, void *opaque); Maybe David can share his opinions. > > >>>> + void *opaque; >>>> +} MemoryAttributeReplayData; >>>> + >>>> +static int >>>> memory_attribute_rdm_replay_populated_cb(MemoryRegionSection *section, >>>> void *arg) >>>> +{ >>>> + MemoryAttributeReplayData *data = arg; >>>> + >>>> + return ((ReplayRamPopulate)data->fn)(section, data->opaque); >>>> +} >>>> + >>>> +static int memory_attribute_rdm_replay_populated(const >>>> RamDiscardManager *rdm, >>>> + MemoryRegionSection >>>> *section, >>>> + ReplayRamPopulate >>>> replay_fn, >>>> + void *opaque) >>>> +{ >>>> + MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm); >>>> + MemoryAttributeReplayData data = { .fn = replay_fn, .opaque = >>>> opaque }; >>>> + >>>> + g_assert(section->mr == mgr->mr); >>>> + return memory_attribute_for_each_populated_section(mgr, section, >>>> &data, >>>> + >>>> memory_attribute_rdm_replay_populated_cb); >>>> +} >>>> + >>>> +static int >>>> memory_attribute_rdm_replay_discarded_cb(MemoryRegionSection *section, >>>> void *arg) >>>> +{ >>>> + MemoryAttributeReplayData *data = arg; >>>> + >>>> + ((ReplayRamDiscard)data->fn)(section, data->opaque); >>>> + return 0; >>>> +} >>>> + >>>> +static void memory_attribute_rdm_replay_discarded(const >>>> RamDiscardManager *rdm, >>>> + MemoryRegionSection >>>> *section, >>>> + ReplayRamDiscard >>>> replay_fn, >>>> + void *opaque) >>>> +{ >>>> + MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm); >>>> + MemoryAttributeReplayData data = { .fn = replay_fn, .opaque = >>>> opaque }; >>>> + >>>> + g_assert(section->mr == mgr->mr); >>>> + memory_attribute_for_each_discarded_section(mgr, section, &data, >>>> + >>>> memory_attribute_rdm_replay_discarded_cb); >>>> +} >>>> + >>>> +int memory_attribute_manager_realize(MemoryAttributeManager *mgr, >>>> MemoryRegion *mr) >>>> +{ >>>> + uint64_t bitmap_size; >>>> + int block_size = memory_attribute_manager_get_block_size(mgr); >>>> + int ret; >>>> + >>>> + bitmap_size = ROUND_UP(mr->size, block_size) / block_size; >>>> + >>>> + mgr->mr = mr; >>>> + mgr->bitmap_size = bitmap_size; >>>> + mgr->shared_bitmap = bitmap_new(bitmap_size); >>>> + >>>> + ret = memory_region_set_ram_discard_manager(mgr->mr, >>>> RAM_DISCARD_MANAGER(mgr)); >>> >>> Move it 3 lines up and avoid stale data in mgr->mr/bitmap_size/ >>> shared_bitmap and avoid g_free below? >> >> Make sense. I will move it up the same as patch 02 before bitmap_new(). >> >>> >>>> + if (ret) { >>>> + g_free(mgr->shared_bitmap); >>>> + } >>>> + >>>> + return ret; >>>> +} >>>> + >>>> +void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr) >>>> +{ >>>> + memory_region_set_ram_discard_manager(mgr->mr, NULL); >>>> + >>>> + g_free(mgr->shared_bitmap); >>>> +} >>>> + >>>> +static void memory_attribute_manager_init(Object *obj) >>> >>> Not used. >>> >>>> +{ >>>> + MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(obj); >>>> + >>>> + QLIST_INIT(&mgr->rdl_list); >>>> +} > + >>>> +static void memory_attribute_manager_finalize(Object *obj) >>> >>> Not used either. Thanks, >> >> I think it is OK to define it as a placeholder? Just some preference. > > At very least gcc should warn on these (I am surprised it did not) and > nobody likes this. Thanks, I tried a little. They must be defined. The init() and finalize() calls are used in the OBJECT_DEFINE_TYPE_WITH_INTERFACES() macro. I think it is a common template to define in this way. > > >>> >>>> +{ >>>> +} >>>> + >>>> +static void memory_attribute_manager_class_init(ObjectClass *oc, void >>>> *data) >>>> +{ >>>> + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc); >>>> + >>>> + rdmc->get_min_granularity = >>>> memory_attribute_rdm_get_min_granularity; >>>> + rdmc->register_listener = memory_attribute_rdm_register_listener; >>>> + rdmc->unregister_listener = >>>> memory_attribute_rdm_unregister_listener; >>>> + rdmc->is_populated = memory_attribute_rdm_is_populated; >>>> + rdmc->replay_populated = memory_attribute_rdm_replay_populated; >>>> + rdmc->replay_discarded = memory_attribute_rdm_replay_discarded; >>>> +} >>>> diff --git a/system/meson.build b/system/meson.build >>>> index 4952f4b2c7..ab07ff1442 100644 >>>> --- a/system/meson.build >>>> +++ b/system/meson.build >>>> @@ -15,6 +15,7 @@ system_ss.add(files( >>>> 'dirtylimit.c', >>>> 'dma-helpers.c', >>>> 'globals.c', >>>> + 'memory-attribute-manager.c', >>>> 'memory_mapping.c', >>>> 'qdev-monitor.c', >>>> 'qtest.c', >>> >> >