--- hatari/src/debug/profilecpu.c 2019/04/09 08:54:23 1.1.1.2 +++ hatari/src/debug/profilecpu.c 2019/04/09 08:59:20 1.1.1.6 @@ -1,7 +1,7 @@ /* * Hatari - profilecpu.c * - * Copyright (C) 2010-2013 by Eero Tamminen + * Copyright (C) 2010-2015 by Eero Tamminen * * This file is distributed under the GNU General Public License, version 2 * or at your option any later version. Read the file gpl.txt for details. @@ -20,10 +20,11 @@ const char Profilecpu_fileid[] = "Hatari #include "dsp.h" #include "m68000.h" #include "68kDisass.h" +#include "symbols.h" #include "profile.h" #include "profile_priv.h" +#include "debug_priv.h" #include "stMemory.h" -#include "symbols.h" #include "tos.h" #include "screen.h" #include "video.h" @@ -34,11 +35,17 @@ const char Profilecpu_fileid[] = "Hatari #define CART_END 0xFC0000 #define CART_SIZE (CART_END - CART_START) +#define TTRAM_START 0x01000000 /* if non-zero, output (more) warnings on suspicious: * - cycle/instruction counts * - PC switches - * And drop to debugger on invalid PC addresses. + * And drop to debugger on invalid current & previous PC addresses. + * + * NOTE: DebugUI() calls that DEBUG define enables, can cause + * instruction count mismatch assertions because debugger invocation + * resets the counters AND happens in middle of data collection. + * It's best to quit after debugging the issue ('q' command). */ #define DEBUG 0 #if DEBUG @@ -46,42 +53,56 @@ const char Profilecpu_fileid[] = "Hatari static bool skip_assert; #endif -static callinfo_t cpu_callinfo; +/* whether to track & show all cache stats for all instructions */ +#define DEBUG_CACHE 0 -/* This is relevant with WinUAE CPU core: - * - the default cycle exact variant needs this define to be non-zero - * - non-cycle exact and MMU variants need this define to be 0 - * for cycle counts to make any sense - */ -#define USE_CYCLES_COUNTER 1 + +static callinfo_t cpu_callinfo; #define MAX_CPU_PROFILE_VALUE 0xFFFFFFFF typedef struct { - Uint32 count; /* how many times this address instrcution is executed */ + Uint32 count; /* how many times this address instruction is executed */ Uint32 cycles; /* how many CPU cycles was taken at this address */ - Uint32 misses; /* how many CPU cache misses happened at this address */ +#if DEBUG_CACHE /* track also less relevant cache events */ + Uint32 i_hits; /* how many CPU i-cache hits happened at this address */ + Uint32 d_misses; /* how many CPU d-cache misses happened at this address */ +#endif + Uint32 i_misses; /* how many CPU i-cache misses happened at this address */ + Uint32 d_hits; /* how many CPU d-cache hits happened at this address */ } cpu_profile_item_t; -#define MAX_MISS 4 + +/* max count of hits/misses single instruction can trigger at once */ +#define MAX_I_HITS 8 +#define MAX_I_MISSES 8 +#define MAX_D_HITS 32 +#define MAX_D_MISSES 20 static struct { counters_t all; /* total counts for all areas */ - Uint32 miss_counts[MAX_MISS]; /* cache miss counts */ cpu_profile_item_t *data; /* profile data items */ Uint32 size; /* number of allocated profile data items */ + profile_area_t ttram; /* TT-RAM stats */ profile_area_t ram; /* normal RAM stats */ profile_area_t rom; /* cartridge ROM stats */ profile_area_t tos; /* ROM TOS stats */ int active; /* number of active data items in all areas */ Uint32 *sort_arr; /* data indexes used for sorting */ int prev_family; /* previous instruction opcode family */ - Uint32 prev_cycles; /* previous instruction cycles counter */ + Uint64 prev_cycles; /* previous instruction cycles counter */ Uint32 prev_pc; /* previous instruction address */ Uint32 loop_start; /* address of last loop start */ Uint32 loop_end; /* address of last loop end */ Uint32 loop_count; /* how many times it was looped */ Uint32 disasm_addr; /* 'addresses' command start address */ +#if ENABLE_WINUAE_CPU + Uint32 i_prefetched; /* instructions that don't incur prefetch hit/miss */ + Uint32 i_hit_counts[MAX_I_HITS]; /* I-cache hit counts */ + Uint32 d_hit_counts[MAX_D_HITS]; /* D-cache hit counts */ + Uint32 i_miss_counts[MAX_I_MISSES]; /* I-cache miss counts */ + Uint32 d_miss_counts[MAX_D_MISSES]; /* D-cache miss counts */ +#endif bool processed; /* true when data is already processed */ bool enabled; /* true when profiling enabled */ } cpu_profile; @@ -121,10 +142,14 @@ static inline Uint32 address2index(Uint3 /* and after TOS as it's higher */ pc += TosSize; } +#if ENABLE_WINUAE_CPU + } else if (TTmemory && pc >= TTRAM_START && pc < TTRAM_START + 1024*(unsigned)ConfigureParams.Memory.TTRamSize_KB) { + pc += STRamEnd + TosSize + CART_SIZE - TTRAM_START; +#endif } else { fprintf(stderr, "WARNING: 'invalid' CPU PC profile instruction address 0x%x!\n", pc); /* extra entry at end is reserved for invalid PC values */ - pc = STRamEnd + TosSize + 0x20000; + pc = STRamEnd + TosSize + CART_SIZE; #if DEBUG skip_assert = true; DebugUI(REASON_CPU_EXCEPTION); @@ -153,7 +178,10 @@ static Uint32 index2address(Uint32 idx) } idx -= TosSize; /* ROM */ - return idx + CART_START; + if (idx < CART_SIZE) { + return idx + CART_START; + } + idx -= CART_SIZE; } else { /* ROM */ if (idx < CART_SIZE) { @@ -161,32 +189,51 @@ static Uint32 index2address(Uint32 idx) } idx -= CART_SIZE; /* TOS */ - return idx + TosAddress; + if (idx < TosSize) { + return idx + TosAddress; + } + idx -= TosSize; } + return idx + TTRAM_START; } /* ------------------ CPU profile results ----------------- */ /** - * Get CPU cycles, count and count percentage for given address. + * Write string containing CPU cache stats, cycles, count, count percentage + * for given address to provided buffer. + * * Return true if data was available and non-zero, false otherwise. */ -bool Profile_CpuAddressData(Uint32 addr, float *percentage, Uint32 *count, Uint32 *cycles, Uint32 *misses) +bool Profile_CpuAddressDataStr(char *buffer, size_t maxlen, Uint32 addr) { + cpu_profile_item_t *item; + float percentage; Uint32 idx; + + assert(buffer && maxlen); if (!cpu_profile.data) { return false; } idx = address2index(addr); - *misses = cpu_profile.data[idx].misses; - *cycles = cpu_profile.data[idx].cycles; - *count = cpu_profile.data[idx].count; + item = &(cpu_profile.data[idx]); + if (cpu_profile.all.count) { - *percentage = 100.0*(*count)/cpu_profile.all.count; + percentage = 100.0 * item->count / cpu_profile.all.count; } else { - *percentage = 0.0; + percentage = 0.0; } - return (*count > 0); +#if DEBUG_CACHE + snprintf(buffer, maxlen, "%5.2f%% (%u, %u, %u, %u, %u, %u)", + percentage, item->count, item->cycles, + item->i_hits, item->i_misses, + item->d_hits, item->d_misses); +#else + snprintf(buffer, maxlen, "%5.2f%% (%u, %u, %u, %u)", + percentage, item->count, item->cycles, + item->i_misses, item->d_hits); +#endif + return (item->count > 0); } /** @@ -201,23 +248,27 @@ static void show_cpu_area_stats(profile_ fprintf(stderr, "- active address range:\n 0x%06x-0x%06x\n", index2address(area->lowest), index2address(area->highest)); - fprintf(stderr, "- active instruction addresses:\n %d (%.2f%% of all)\n", + fprintf(stderr, "- active instruction addresses:\n %d (%.2f%% of all areas)\n", area->active, 100.0 * area->active / cpu_profile.active); - fprintf(stderr, "- executed instructions:\n %"PRIu64" (%.2f%% of all)\n", + fprintf(stderr, "- executed instructions:\n %"PRIu64" (%.2f%% of all areas)\n", area->counters.count, 100.0 * area->counters.count / cpu_profile.all.count); -#if ENABLE_WINUAE_CPU - if (cpu_profile.all.misses) { /* CPU cache in use? */ - fprintf(stderr, "- instruction cache misses:\n %"PRIu64" (%.2f%% of all)\n", - area->counters.misses, - 100.0 * area->counters.misses / cpu_profile.all.misses); + /* CPU cache in use? */ + if (cpu_profile.all.i_misses) { + fprintf(stderr, "- instruction cache misses:\n %"PRIu64" (%.2f%% of all areas)\n", + area->counters.i_misses, + 100.0 * area->counters.i_misses / cpu_profile.all.i_misses); + } + if (cpu_profile.all.d_hits) { + fprintf(stderr, "- data cache hits:\n %"PRIu64" (%.2f%% of all areas)\n", + area->counters.d_hits, + 100.0 * area->counters.d_hits / cpu_profile.all.d_hits); } -#endif - fprintf(stderr, "- used cycles:\n %"PRIu64" (%.2f%% of all)\n = %.5fs\n", + fprintf(stderr, "- used cycles:\n %"PRIu64" (%.2f%% of all areas)\n = %.5fs\n", area->counters.cycles, 100.0 * area->counters.cycles / cpu_profile.all.cycles, - (double)area->counters.cycles / MachineClocks.CPU_Freq); + (double)area->counters.cycles / MachineClocks.CPU_Freq_Emul); if (area->overflow) { fprintf(stderr, " *** COUNTER OVERFLOW! ***\n"); } @@ -238,28 +289,79 @@ void Profile_CpuShowStats(void) fprintf(stderr, "Cartridge ROM (0x%X-%X):\n", CART_START, CART_END); show_cpu_area_stats(&cpu_profile.rom); + if (TTmemory && ConfigureParams.Memory.TTRamSize_KB) { + fprintf(stderr, "TT-RAM (0x%X-%X):\n", TTRAM_START, TTRAM_START + 1024*ConfigureParams.Memory.TTRamSize_KB); + show_cpu_area_stats(&cpu_profile.ttram); + } + fprintf(stderr, "\n= %.5fs\n", - (double)cpu_profile.all.cycles / MachineClocks.CPU_Freq); + (double)cpu_profile.all.cycles / MachineClocks.CPU_Freq_Emul); +} #if ENABLE_WINUAE_CPU - if (cpu_profile.all.misses) { /* CPU cache in use? */ - int i; - fprintf(stderr, "\nCache misses per instruction, number of occurrences:\n"); - for (i = 0; i < MAX_MISS; i++) { - fprintf(stderr, "- %d: %d\n", i, cpu_profile.miss_counts[i]); +/** + * show percentage histogram of given array items + */ +static void show_histogram(const char *title, int count, Uint32 *items) +{ + const Uint64 maxval = cpu_profile.all.count; + Uint32 value; + int i; + + fprintf(stderr, "\n%s, number of occurrences:\n", title); + for (i = 0; i < count; i++) { + value = items[i]; + if (value) { + int w, width = 50 * value / maxval+1; + fprintf(stderr, " %2d: ", i); + for (w = 0; w < width; w++) { + fputc('#', stderr); + } + fprintf(stderr, " %.3f%%\n", 100.0 * value / maxval); } } -#endif } /** + * show CPU cache usage histograms + */ +void Profile_CpuShowCaches(void) +{ + if (!(cpu_profile.all.i_misses || cpu_profile.all.d_hits)) { + fprintf(stderr, "No instruction/data cache information.\n"); + return; + } + fprintf(stderr, + "\nNote:\n" + "- these statistics include all profiled instructions, but\n" + "- instruction cache events happen only on prefetch/branch\n" + "- data cache events can happen only for instructions that do memory reads\n" + "\nAlready prefetched instructions: %.3f%% (no hits/misses)\n", + 100.0 * cpu_profile.i_prefetched / cpu_profile.all.count); + + show_histogram("Instruction cache hits per instruction", + ARRAY_SIZE(cpu_profile.i_hit_counts), cpu_profile.i_hit_counts); + show_histogram("Instruction cache misses per instruction", + ARRAY_SIZE(cpu_profile.i_miss_counts), cpu_profile.i_miss_counts); + show_histogram("Data cache hits per instruction", + ARRAY_SIZE(cpu_profile.d_hit_counts), cpu_profile.d_hit_counts); + show_histogram("Data cache misses per instruction", + ARRAY_SIZE(cpu_profile.d_miss_counts), cpu_profile.d_miss_counts); +} +#else +void Profile_CpuShowCaches(void) { + fprintf(stderr, "Cache information is recorded only with WinUAE CPU.\n"); +} +#endif + +/** * Show CPU instructions which execution was profiled, in the address order, * starting from the given address. Return next disassembly address. */ -Uint32 Profile_CpuShowAddresses(Uint32 lower, Uint32 upper, FILE *out) +Uint32 Profile_CpuShowAddresses(Uint32 lower, Uint32 upper, FILE *out, paging_t use_paging) { int oldcols[DISASM_COLUMNS], newcols[DISASM_COLUMNS]; - int show, shown, active; + int show, shown, addrs, active; const char *symbol; cpu_profile_item_t *data; Uint32 idx, end, size; @@ -275,15 +377,17 @@ Uint32 Profile_CpuShowAddresses(Uint32 l active = cpu_profile.active; if (upper) { end = address2index(upper); - show = active; if (end > size) { end = size; } } else { end = size; - show = ConfigureParams.Debugger.nDisasmLines; - if (!show || show > active) { - show = active; + } + show = INT_MAX; + if (use_paging == PAGING_ENABLED) { + show = DebugUI_GetPageLines(ConfigureParams.Debugger.nDisasmLines, 0); + if (!show) { + show = INT_MAX; } } @@ -292,28 +396,36 @@ Uint32 Profile_CpuShowAddresses(Uint32 l Disasm_DisableColumn(DISASM_COLUMN_HEXDUMP, oldcols, newcols); Disasm_SetColumns(newcols); - fputs("# disassembly with profile data: % (, , )\n", out); + fputs("# disassembly with profile data: % (, , , )\n", out); + shown = 2; /* first and last printf */ - nextpc = 0; + addrs = nextpc = 0; idx = address2index(lower); - for (shown = 0; shown < show && idx < end; idx++) { + for (; shown < show && addrs < active && idx < end; idx++) { if (!data[idx].count) { continue; } addr = index2address(idx); if (addr != nextpc && nextpc) { fprintf(out, "[...]\n"); + shown++; } - symbol = Symbols_GetByCpuAddress(addr); + symbol = Symbols_GetByCpuAddress(addr, SYMTYPE_TEXT); if (symbol) { fprintf(out, "%s:\n", symbol); + shown++; } /* NOTE: column setup works only with 68kDisass disasm engine! */ Disasm(out, addr, &nextpc, 1); shown++; + addrs++; + } + if (idx < end) { + printf("Disassembled %d (of active %d) CPU addresses.\n", addrs, active); + } else { + printf("Disassembled last %d (of active %d) CPU addresses, wrapping...\n", addrs, active); + nextpc = 0; } - printf("Disassembled %d (of active %d) CPU addresses.\n", shown, active); - /* restore disassembly columns */ Disasm_SetColumns(oldcols); return nextpc; @@ -342,10 +454,10 @@ static void leave_instruction_column(int /** * compare function for qsort() to sort CPU profile data by instruction cache misses. */ -static int cmp_cpu_misses(const void *p1, const void *p2) +static int cmp_cpu_i_misses(const void *p1, const void *p2) { - Uint32 count1 = cpu_profile.data[*(const Uint32*)p1].misses; - Uint32 count2 = cpu_profile.data[*(const Uint32*)p2].misses; + Uint32 count1 = cpu_profile.data[*(const Uint32*)p1].i_misses; + Uint32 count2 = cpu_profile.data[*(const Uint32*)p2].i_misses; if (count1 > count2) { return -1; } @@ -358,7 +470,61 @@ static int cmp_cpu_misses(const void *p1 /** * Sort CPU profile data addresses by instruction cache misses and show the results. */ -void Profile_CpuShowMisses(int show) +void Profile_CpuShowInstrMisses(int show) +{ + int active; + int oldcols[DISASM_COLUMNS]; + Uint32 *sort_arr, *end, addr, nextpc; + cpu_profile_item_t *data = cpu_profile.data; + float percentage; + Uint32 count; + + if (!cpu_profile.all.i_misses) { + fprintf(stderr, "No CPU instruction cache miss information available.\n"); + return; + } + + active = cpu_profile.active; + sort_arr = cpu_profile.sort_arr; + qsort(sort_arr, active, sizeof(*sort_arr), cmp_cpu_i_misses); + + leave_instruction_column(oldcols); + + printf("addr:\t\ti-cache misses:\n"); + show = (show < active ? show : active); + for (end = sort_arr + show; sort_arr < end; sort_arr++) { + addr = index2address(*sort_arr); + count = data[*sort_arr].i_misses; + percentage = 100.0*count/cpu_profile.all.i_misses; + printf("0x%06x\t%5.2f%%\t%d%s\t", addr, percentage, count, + count == MAX_CPU_PROFILE_VALUE ? " (OVERFLOW)" : ""); + Disasm(stdout, addr, &nextpc, 1); + } + printf("%d CPU addresses listed.\n", show); + + Disasm_SetColumns(oldcols); +} + +/** + * compare function for qsort() to sort CPU profile data by data cache hits. + */ +static int cmp_cpu_d_hits(const void *p1, const void *p2) +{ + Uint32 count1 = cpu_profile.data[*(const Uint32*)p1].d_hits; + Uint32 count2 = cpu_profile.data[*(const Uint32*)p2].d_hits; + if (count1 > count2) { + return -1; + } + if (count1 < count2) { + return 1; + } + return 0; +} + +/** + * Sort CPU profile data addresses by data cache hits and show the results. + */ +void Profile_CpuShowDataHits(int show) { int active; int oldcols[DISASM_COLUMNS]; @@ -367,23 +533,23 @@ void Profile_CpuShowMisses(int show) float percentage; Uint32 count; - if (!cpu_profile.all.misses) { - fprintf(stderr, "No CPU cache miss information available.\n"); + if (!cpu_profile.all.d_hits) { + fprintf(stderr, "No CPU data cache hit information available.\n"); return; } active = cpu_profile.active; sort_arr = cpu_profile.sort_arr; - qsort(sort_arr, active, sizeof(*sort_arr), cmp_cpu_misses); + qsort(sort_arr, active, sizeof(*sort_arr), cmp_cpu_d_hits); leave_instruction_column(oldcols); - printf("addr:\t\tmisses:\n"); + printf("addr:\t\td-cache hits:\n"); show = (show < active ? show : active); for (end = sort_arr + show; sort_arr < end; sort_arr++) { addr = index2address(*sort_arr); - count = data[*sort_arr].misses; - percentage = 100.0*count/cpu_profile.all.misses; + count = data[*sort_arr].d_hits; + percentage = 100.0*count/cpu_profile.all.d_hits; printf("0x%06x\t%5.2f%%\t%d%s\t", addr, percentage, count, count == MAX_CPU_PROFILE_VALUE ? " (OVERFLOW)" : ""); Disasm(stdout, addr, &nextpc, 1); @@ -392,9 +558,13 @@ void Profile_CpuShowMisses(int show) Disasm_SetColumns(oldcols); } + #else -void Profile_CpuShowMisses(int show) { - fprintf(stderr, "Cache misses are recorded only with WinUAE CPU.\n"); +void Profile_CpuShowInstrMisses(int show) { + fprintf(stderr, "Cache information is recorded only with WinUAE CPU.\n"); +} +void Profile_CpuShowDataHits(int show) { + fprintf(stderr, "Cache information is recorded only with WinUAE CPU.\n"); } #endif @@ -512,7 +682,7 @@ void Profile_CpuShowCounts(int show, boo return; } - symbols = Symbols_CpuCount(); + symbols = Symbols_CpuCodeCount(); if (!symbols) { fprintf(stderr, "ERROR: no CPU symbols loaded!\n"); return; @@ -525,7 +695,7 @@ void Profile_CpuShowCounts(int show, boo for (end = sort_arr + active; sort_arr < end; sort_arr++) { addr = index2address(*sort_arr); - name = Symbols_GetByCpuAddress(addr); + name = Symbols_GetByCpuAddress(addr, SYMTYPE_TEXT); if (!name) { continue; } @@ -551,7 +721,7 @@ static const char * addr2name(Uint32 add { Uint32 idx = address2index(addr); *total = cpu_profile.data[idx].count; - return Symbols_GetByCpuAddress(addr); + return Symbols_GetByCpuAddress(addr, SYMTYPE_TEXT); } /** @@ -567,21 +737,29 @@ void Profile_CpuShowCallers(FILE *fp) */ void Profile_CpuSave(FILE *out) { - Uint32 text; - fputs("Field names:\tExecuted instructions, Used cycles, Instruction cache misses\n", out); - /* (Python) pegexp that matches address and all describled fields from disassembly: - * $ : % (, , ) - * $e5af38 : rts 0.00% (12, 0, 12) + Uint32 text, end; + fputs("Field names:\tExecuted instructions, Used cycles, Instruction cache misses, Data cache hits\n", out); + /* (Python) regexp that matches address and all described fields from disassembly: + * $ : % (, , , ) + * $e5af38 : rts 0.00% (12, 0, 12, 0) */ fputs("Field regexp:\t^\\$([0-9a-f]+) :.*% \\((.*)\\)$\n", out); /* some information for interpreting the addresses */ - fprintf(out, "ROM_TOS:\t0x%06x-0x%06x\n", TosAddress, TosAddress + TosSize); + fprintf(out, "ST_RAM:\t\t0x%06x-0x%06x\n", 0, STRamEnd); + end = TosAddress + TosSize; + fprintf(out, "ROM_TOS:\t0x%06x-0x%06x\n", TosAddress, end); + fprintf(out, "CARTRIDGE:\t0x%06x-0x%06x\n", CART_START, CART_END); text = DebugInfo_GetTEXT(); - if (text < TosAddress) { + if (text && (text < TosAddress || text >= TTRAM_START)) { fprintf(out, "PROGRAM_TEXT:\t0x%06x-0x%06x\n", text, DebugInfo_GetTEXTEnd()); } - fprintf(out, "CARTRIDGE:\t0x%06x-0x%06x\n", CART_START, CART_END); - Profile_CpuShowAddresses(0, CART_END-2, out); + if (TTmemory && ConfigureParams.Memory.TTRamSize_KB) { + end = TTRAM_START + 1024*ConfigureParams.Memory.TTRamSize_KB; + fprintf(out, "TT_RAM:\t\t0x%08x-0x%08x\n", TTRAM_START, end); + } else if (end < CART_END) { + end = CART_END; + } + Profile_CpuShowAddresses(0, end-2, out, PAGING_DISABLED); Profile_CpuShowCallers(out); } @@ -610,7 +788,10 @@ bool Profile_CpuStart(void) memset(&cpu_profile, 0, sizeof(cpu_profile)); /* Shouldn't change within same debug session */ - size = (STRamEnd + 0x20000 + TosSize) / 2; + size = (STRamEnd + CART_SIZE + TosSize) / 2; + if (TTmemory && ConfigureParams.Memory.TTRamSize_KB) { + size += ConfigureParams.Memory.TTRamSize_KB * 1024/2; + } /* Add one entry for catching invalid PC values */ cpu_profile.data = calloc(size + 1, sizeof(*cpu_profile.data)); @@ -622,7 +803,7 @@ bool Profile_CpuStart(void) (int)sizeof(*cpu_profile.data)*size/(1024*1024)); cpu_profile.size = size; - Profile_AllocCallinfo(&(cpu_callinfo), Symbols_CpuCount(), "CPU"); + Profile_AllocCallinfo(&(cpu_callinfo), Symbols_CpuCodeCount(), "CPU"); /* special hack for EmuTOS */ etos_switcher = PC_UNDEFINED; @@ -631,10 +812,18 @@ bool Profile_CpuStart(void) etos_switcher = PC_UNDEFINED; } - cpu_profile.prev_cycles = Cycles_GetCounter(CYCLES_COUNTER_CPU); - cpu_profile.prev_family = OpcodeFamily; - cpu_profile.prev_pc = M68000_GetPC() & 0xffffff; + /* reset cache stats (CPU emulation doesn't do that) */ + CpuInstruction.D_Cache_hit = 0; + CpuInstruction.I_Cache_hit = 0; + CpuInstruction.I_Cache_miss = 0; + CpuInstruction.D_Cache_miss = 0; + cpu_profile.prev_cycles = CyclesGlobalClockCounter; + cpu_profile.prev_family = OpcodeFamily; + cpu_profile.prev_pc = M68000_GetPC(); + if (ConfigureParams.System.bAddressSpace24) { + cpu_profile.prev_pc &= 0xffffff; + } cpu_profile.loop_start = PC_UNDEFINED; cpu_profile.loop_end = PC_UNDEFINED; cpu_profile.loop_count = 0; @@ -746,7 +935,7 @@ static void collect_calls(Uint32 pc, cou } /* address is one which we're tracking? */ - idx = Symbols_GetCpuAddressIndex(pc); + idx = Symbols_GetCpuCodeIndex(pc); if (unlikely(idx >= 0)) { flag = cpu_opcode_type(family, prev_pc, pc); @@ -768,7 +957,7 @@ static void collect_calls(Uint32 pc, cou * valid prev_pc value stored */ cpu_callinfo.return_pc = PC_UNDEFINED; - fprintf(stderr, "WARNING: previous PC from callinfo for 0x%d is undefined!\n", pc); + fprintf(stderr, "WARNING: previous PC for tracked address 0x%d is undefined!\n", pc); #if DEBUG skip_assert = true; DebugUI(REASON_CPU_EXCEPTION); @@ -805,6 +994,24 @@ static void log_last_loop(void) } } +# if DEBUG || ENABLE_WINUAE_CPU +/** + * Warning for values going out of expected range + */ +static Uint32 warn_too_large(const char *name, const int value, const int limit, const Uint32 prev_pc, const Uint32 pc) +{ + Uint32 nextpc; + fprintf(stderr, "WARNING: unexpected (%d > %d) %s at 0x%x:\n", value, limit - 1, name, pc); + Disasm(stderr, prev_pc, &nextpc, 1); + Disasm(stderr, pc, &nextpc, 1); +#if DEBUG + skip_assert = true; + DebugUI(REASON_CPU_EXCEPTION); +#endif + return limit - 1; +} +#endif + /** * Update CPU cycle and count statistics for PC address. * @@ -814,15 +1021,22 @@ static void log_last_loop(void) void Profile_CpuUpdate(void) { counters_t *counters = &(cpu_profile.all); - Uint32 pc, prev_pc, idx, cycles, misses; + Uint32 pc, prev_pc, idx, cycles; cpu_profile_item_t *prev; +#if ENABLE_WINUAE_CPU + Uint32 i_hits, d_hits, i_misses, d_misses; +#else + const Uint32 i_misses = 0, d_hits = 0; +#endif prev_pc = cpu_profile.prev_pc; - /* PC may have extra bits, they need to be masked away as + /* PC may have extra bits when using 24 bit addressing, they need to be masked away as * emulation itself does that too when PC value is used */ - cpu_profile.prev_pc = pc = M68000_GetPC() & 0xffffff; - + cpu_profile.prev_pc = pc = M68000_GetPC(); + if (ConfigureParams.System.bAddressSpace24) { + cpu_profile.prev_pc &= 0xffffff; + } if (unlikely(profile_loop.fp)) { if (pc < prev_pc) { if (pc == cpu_profile.loop_start && prev_pc == cpu_profile.loop_end) { @@ -835,7 +1049,7 @@ void Profile_CpuUpdate(void) } else { if (pc > cpu_profile.loop_end) { log_last_loop(); - cpu_profile.loop_end = 0xffffffff; + cpu_profile.loop_end = 0xffffffff; cpu_profile.loop_count = 0; } } @@ -849,22 +1063,8 @@ void Profile_CpuUpdate(void) prev->count++; } -#if USE_CYCLES_COUNTER - /* Confusingly, with DSP enabled, cycle counter is for this instruction, - * without DSP enabled, it's a monotonically increasing counter. - */ - if (bDspEnabled) { - cycles = Cycles_GetCounter(CYCLES_COUNTER_CPU); - } else { - Uint32 newcycles = Cycles_GetCounter(CYCLES_COUNTER_CPU); - cycles = newcycles - cpu_profile.prev_cycles; - cpu_profile.prev_cycles = newcycles; - } -#else - cycles = CurrentInstrCycles + nWaitStateCycles; -#endif - /* cycles are based on 8Mhz clock, change them to correct one */ - cycles <<= nCpuFreqShift; + cycles = CyclesGlobalClockCounter - cpu_profile.prev_cycles; + cpu_profile.prev_cycles = CyclesGlobalClockCounter; if (likely(prev->cycles < MAX_CPU_PROFILE_VALUE - cycles)) { prev->cycles += cycles; @@ -873,27 +1073,78 @@ void Profile_CpuUpdate(void) } #if ENABLE_WINUAE_CPU - misses = CpuInstruction.iCacheMisses; - assert(misses < MAX_MISS); - cpu_profile.miss_counts[misses]++; - if (likely(prev->misses < MAX_CPU_PROFILE_VALUE - misses)) { - prev->misses += misses; + /* only WinUAE CPU core provides cache information */ + i_hits = CpuInstruction.I_Cache_hit; + d_hits = CpuInstruction.D_Cache_hit; + i_misses = CpuInstruction.I_Cache_miss; + d_misses = CpuInstruction.D_Cache_miss; + + /* reset cache stats after reading them (for the next instruction) */ + CpuInstruction.I_Cache_hit = 0; + CpuInstruction.D_Cache_hit = 0; + CpuInstruction.I_Cache_miss = 0; + CpuInstruction.D_Cache_miss = 0; + + /* tracked for every address */ +# if DEBUG_CACHE + if (likely(prev->i_hits < MAX_CPU_PROFILE_VALUE - i_hits)) { + prev->i_hits += i_hits; } else { - prev->misses = MAX_CPU_PROFILE_VALUE; + prev->i_hits = MAX_CPU_PROFILE_VALUE; } -#else - misses = 0; -#endif + if (likely(prev->d_misses < MAX_CPU_PROFILE_VALUE - d_misses)) { + prev->d_misses += d_misses; + } else { + prev->d_misses = MAX_CPU_PROFILE_VALUE; + } +# endif + if (likely(prev->i_misses < MAX_CPU_PROFILE_VALUE - i_misses)) { + prev->i_misses += i_misses; + } else { + prev->i_misses = MAX_CPU_PROFILE_VALUE; + } + if (likely(prev->d_hits < MAX_CPU_PROFILE_VALUE - d_hits)) { + prev->d_hits += d_hits; + } else { + prev->d_hits = MAX_CPU_PROFILE_VALUE; + } + + /* tracking for histogram, check for array overflows */ + if (!(i_hits || i_misses)) { + cpu_profile.i_prefetched++; + } + if (unlikely(i_hits >= MAX_I_HITS)) { + i_hits = warn_too_large("number of CPU instruction cache hits", i_hits, MAX_I_HITS, prev_pc, pc); + } + cpu_profile.i_hit_counts[i_hits]++; + + if (unlikely(i_misses >= MAX_I_MISSES)) { + i_misses = warn_too_large("number of CPU instruction cache misses", i_misses, MAX_I_MISSES, prev_pc, pc); + } + cpu_profile.i_miss_counts[i_misses]++; + + if (unlikely(d_hits >= MAX_D_HITS)) { + d_hits = warn_too_large("number of CPU data cache hits", d_hits, MAX_D_HITS, prev_pc, pc); + } + cpu_profile.d_hit_counts[d_hits]++; + + if (unlikely(d_misses >= MAX_D_MISSES)) { + d_misses = warn_too_large("number of CPU data cache misses", d_misses, MAX_D_MISSES, prev_pc, pc); + } + cpu_profile.d_miss_counts[d_misses]++; +#endif /* ENABLE_WINUAE_CPU */ + if (cpu_callinfo.sites) { collect_calls(prev_pc, counters); } - /* counters are increased after caller info is processed, + /* total counters are increased after caller info is processed, * otherwise cost for the instruction calling the callee * doesn't get accounted to caller (but callee). */ - counters->misses += misses; - counters->cycles += cycles; counters->count++; + counters->cycles += cycles; + counters->i_misses += i_misses; + counters->d_hits += d_hits; #if DEBUG if (unlikely(OpcodeFamily == 0)) { @@ -903,15 +1154,21 @@ void Profile_CpuUpdate(void) } /* catch too large (and negative) cycles for other than STOP instruction */ if (unlikely(cycles > 512 && OpcodeFamily != i_STOP)) { - Uint32 nextpc; - fprintf(stderr, "WARNING: cycles %d > 512:\n", cycles); - Disasm(stderr, prev_pc, &nextpc, 1); + warn_too_large("cycles", cycles, 512, prev_pc, pc); } - if (unlikely(cycles == 0)) { - Uint32 nextpc; - fputs("WARNING: Zero cycles for an opcode:\n", stderr); - Disasm(stderr, prev_pc, &nextpc, 1); +# if !ENABLE_WINUAE_CPU + { + static Uint32 prev_cycles = 0, prev_pc2 = 0; + if (unlikely(cycles == 0 && prev_cycles == 0)) { + Uint32 nextpc; + fputs("WARNING: Zero cycles for successive opcodes:\n", stderr); + Disasm(stderr, prev_pc2, &nextpc, 1); + Disasm(stderr, prev_pc, &nextpc, 1); + } + prev_cycles = cycles; + prev_pc2 = prev_pc; } +# endif #endif } @@ -928,8 +1185,9 @@ static void update_area_item(profile_are return; } area->counters.count += count; - area->counters.misses += item->misses; area->counters.cycles += cycles; + area->counters.i_misses += item->i_misses; + area->counters.d_hits += item->d_hits; if (cycles == MAX_CPU_PROFILE_VALUE) { area->overflow = true; @@ -984,6 +1242,7 @@ static Uint32* index_area(profile_area_t void Profile_CpuStop(void) { Uint32 *sort_arr, next; + unsigned int size, stsize; int active; if (cpu_profile.processed || !cpu_profile.enabled) { @@ -996,15 +1255,25 @@ void Profile_CpuStop(void) } /* user didn't change RAM or TOS size in the meanwhile? */ - assert(cpu_profile.size == (STRamEnd + 0x20000 + TosSize) / 2); + size = stsize = (STRamEnd + CART_SIZE + TosSize) / 2; + if (TTmemory && ConfigureParams.Memory.TTRamSize_KB) { + size += ConfigureParams.Memory.TTRamSize_KB * 1024/2; + } + assert(cpu_profile.size == size); Profile_FinalizeCalls(&(cpu_callinfo), &(cpu_profile.all), Symbols_GetByCpuAddress); /* find lowest and highest addresses executed etc */ next = update_area(&cpu_profile.ram, 0, STRamEnd/2); - next = update_area(&cpu_profile.tos, next, (STRamEnd + TosSize)/2); - next = update_area(&cpu_profile.rom, next, cpu_profile.size); - assert(next == cpu_profile.size); + if (TosAddress < CART_START) { + next = update_area(&cpu_profile.tos, next, (STRamEnd + TosSize)/2); + next = update_area(&cpu_profile.rom, next, stsize); + } else { + next = update_area(&cpu_profile.rom, next, (STRamEnd + CART_SIZE)/2); + next = update_area(&cpu_profile.tos, next, stsize); + } + next = update_area(&cpu_profile.ttram, next, size); + assert(next == size); #if DEBUG if (skip_assert) { @@ -1012,13 +1281,22 @@ void Profile_CpuStop(void) } else #endif { - assert(cpu_profile.all.misses == cpu_profile.ram.counters.misses + cpu_profile.tos.counters.misses + cpu_profile.rom.counters.misses); - assert(cpu_profile.all.cycles == cpu_profile.ram.counters.cycles + cpu_profile.tos.counters.cycles + cpu_profile.rom.counters.cycles); - assert(cpu_profile.all.count == cpu_profile.ram.counters.count + cpu_profile.tos.counters.count + cpu_profile.rom.counters.count); +#if DEBUG + if (cpu_profile.all.count != cpu_profile.ttram.counters.count + cpu_profile.ram.counters.count + cpu_profile.tos.counters.count + cpu_profile.rom.counters.count) { + fprintf(stderr, "ERROR, instruction count mismatch:\n\t%"PRIu64" != %"PRIu64" + %"PRIu64" + %"PRIu64" + %"PRIu64"?\n", + cpu_profile.all.count, cpu_profile.ttram.counters.count, cpu_profile.ram.counters.count, + cpu_profile.tos.counters.count, cpu_profile.rom.counters.count); + fprintf(stderr, "If there was debugger invocation from profiling before this, try with profiler DEBUG define disabled!!!\n"); + } +#endif + assert(cpu_profile.all.count == cpu_profile.ttram.counters.count + cpu_profile.ram.counters.count + cpu_profile.tos.counters.count + cpu_profile.rom.counters.count); + assert(cpu_profile.all.cycles == cpu_profile.ttram.counters.cycles + cpu_profile.ram.counters.cycles + cpu_profile.tos.counters.cycles + cpu_profile.rom.counters.cycles); + assert(cpu_profile.all.i_misses == cpu_profile.ttram.counters.i_misses + cpu_profile.ram.counters.i_misses + cpu_profile.tos.counters.i_misses + cpu_profile.rom.counters.i_misses); + assert(cpu_profile.all.d_hits == cpu_profile.ttram.counters.d_hits + cpu_profile.ram.counters.d_hits + cpu_profile.tos.counters.d_hits + cpu_profile.rom.counters.d_hits); } /* allocate address array for sorting */ - active = cpu_profile.ram.active + cpu_profile.rom.active + cpu_profile.tos.active; + active = cpu_profile.ttram.active + cpu_profile.ram.active + cpu_profile.rom.active + cpu_profile.tos.active; sort_arr = calloc(active, sizeof(*sort_arr)); if (!sort_arr) { @@ -1036,6 +1314,7 @@ void Profile_CpuStop(void) sort_arr = index_area(&cpu_profile.ram, sort_arr); sort_arr = index_area(&cpu_profile.tos, sort_arr); sort_arr = index_area(&cpu_profile.rom, sort_arr); + sort_arr = index_area(&cpu_profile.ttram, sort_arr); assert(sort_arr == cpu_profile.sort_arr + cpu_profile.active); //printf("%d/%d/%d\n", area->active, sort_arr-cpu_profile.sort_arr, active); @@ -1056,7 +1335,7 @@ void Profile_CpuGetPointers(bool **enabl /** * Get callinfo & symbol search pointers for stack walking. */ -void Profile_CpuGetCallinfo(callinfo_t **callinfo, const char* (**get_symbol)(Uint32)) +void Profile_CpuGetCallinfo(callinfo_t **callinfo, const char* (**get_symbol)(Uint32, symtype_t)) { *callinfo = &(cpu_callinfo); *get_symbol = Symbols_GetByCpuAddress;