1
0
Fork 0
mirror of https://github.com/AquariaOSE/Aquaria.git synced 2025-01-24 17:26:41 +00:00

remove iprof and BBGE_PROF (#74)

This commit is contained in:
fgenesis 2022-03-17 20:50:42 +01:00
parent 47f8677aa7
commit 26759c8be1
22 changed files with 3 additions and 2286 deletions

View file

@ -1829,7 +1829,6 @@ const int chkDist = 2500*2500;
Target Avatar::getNearestTarget(const Vector &checkPos, const Vector &distPos, Entity *source, DamageType dt, bool override, std::vector<Target> *ignore)
{
BBGE_PROF(Avatar_getNearestTarget);
Target t;
Vector targetPosition;
@ -5330,8 +5329,6 @@ bool lastCursorKeyboard = false;
void Avatar::onUpdate(float dt)
{
BBGE_PROF(Avatar_onUpdate);
looking = 0;
@ -5492,7 +5489,6 @@ void Avatar::onUpdate(float dt)
lastWaterBubble = waterBubble;
waterBubble = 0;
BBGE_PROF(Avatar_splashOut);
splash(false);
if (dsq->continuity.form != FORM_FISH)

View file

@ -179,7 +179,6 @@ void Element::updateEffects(float dt)
void Element::update(float dt)
{
BBGE_PROF(Element_update);
if (!core->particlesPaused)
{
updateLife(dt);

View file

@ -2546,7 +2546,6 @@ void Entity::addIgnoreShotDamageType(DamageType dt)
void Entity::doSpellAvoidance(float dt, int range, float mod)
{
BBGE_PROF(Entity_doSpellAvoidance);
Vector accum;
int c = 0;

View file

@ -4071,7 +4071,6 @@ bool Game::isEntityCollideWithShot(Entity *e, Shot *shot)
void Game::handleShotCollisions(Entity *e, bool hasShield)
{
BBGE_PROF(Game_handleShotCollisions);
for (size_t i = 0; i < Shot::shots.size(); ++i)
{
Shot *shot = Shot::shots[i];
@ -4103,7 +4102,6 @@ bool Game::isDamageTypeEnemy(DamageType dt)
void Game::handleShotCollisionsSkeletal(Entity *e)
{
BBGE_PROF(Game_HSSKELETAL);
for (size_t i = 0; i < Shot::shots.size(); ++i)
{
Shot *shot = Shot::shots[i];

View file

@ -119,9 +119,6 @@ void Hair::onUpdate(float dt)
void Hair::updatePositions()
{
BBGE_PROF(Hair_updatePositions);
for (size_t i = 1; i < hairNodes.size(); i++)
{
Vector diff = hairNodes[i].position - hairNodes[i-1].position;

View file

@ -330,25 +330,15 @@ void SchoolFish::applySeparation(Vector &accumulator)
void SchoolFish::onUpdate(float dt)
{
BBGE_PROF(SchoolFish_onUpdate);
{
burstDelay -= dt;
if (burstDelay < 0)
{
burstDelay = 0;
}
}
burstDelay -= dt;
if (burstDelay < 0)
burstDelay = 0;
if (stickToNaijasHead && alpha.x < 0.1f)
stickToNaijasHead = false;
if (this->layer < LR_ENTITIES)
{
setEntityType(ET_NEUTRAL);
collideRadius = 0;
}

View file

@ -461,8 +461,6 @@ void ScriptedEntity::stopPull()
void ScriptedEntity::onUpdate(float dt)
{
BBGE_PROF(ScriptedEntity_onUpdate);
CollideEntity::onUpdate(dt);
if (becomeSolidDelay)

View file

@ -450,7 +450,6 @@ void Shot::onEndOfLife()
void Shot::doHitEffects()
{
BBGE_PROF(Shot_doHitEffects);
if (shotData)
{
if (!shotData->hitPrt.empty())
@ -478,7 +477,6 @@ void Shot::suicide()
bool Shot::onHitWall(bool reflect)
{
BBGE_PROF(Shot_onHitWall);
doHitEffects();
updateSegments(position);

View file

@ -31,9 +31,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#include "BBGECompileConfig.h"
#define BBGE_PROF(x)
// Compile-time assertion. Expands to a switch statement with the labels
// `case 0` and `case (pred)`: when pred evaluates to 0 the two labels collide
// and compilation fails; any non-zero constant compiles to a no-op.
// pred must be an integral constant expression (it is used as a case label),
// and because the expansion is a statement it can only appear inside a function body.
#define compile_assert(pred) switch(0){case 0:case (pred):;}
#ifdef _MSC_VER

View file

@ -1046,8 +1046,6 @@ void Core::run(float runTime)
while((runTime == -1 && !loopDone) || (runTime >0))
{
BBGE_PROF(Core_main);
nowTicks = SDL_GetTicks();
dt = (nowTicks-thenTicks)/1000.0;
thenTicks = nowTicks;
@ -1177,8 +1175,6 @@ void Core::run(float runTime)
showBuffer();
BBGE_PROF(STOP);
if (nestedMains == 1)
clearGarbage();
@ -1741,10 +1737,6 @@ void Core::updateCullData()
void Core::render(int startLayer, int endLayer, bool useFrameBufferIfAvail)
{
BBGE_PROF(Core_render);
if (startLayer == -1 && endLayer == -1 && overrideStartLayer != 0)
{
startLayer = overrideStartLayer;
@ -2048,8 +2040,6 @@ CountedPtr<Texture> Core::doTextureAdd(const std::string &texture, const std::st
CountedPtr<Texture> Core::addTexture(const std::string &textureName)
{
BBGE_PROF(Core_addTexture);
if (textureName.empty())
return NULL;
@ -2192,7 +2182,6 @@ void Core::enqueueRenderObjectDeletion(RenderObject *object)
void Core::clearGarbage()
{
BBGE_PROF(Core_clearGarbage);
// HACK: optimize this (use a list instead of a queue)
for (RenderObjects::iterator i = garbage.begin(); i != garbage.end(); i++)

View file

@ -30,7 +30,6 @@ Emitter::Emitter(ParticleEffect *pe) : Quad(), pe(pe)
void Emitter::destroy()
{
BBGE_PROF(Emitter_destroy);
for (Particles::iterator i = particles.begin(); i != particles.end(); i++)
{
(*i)->active = false;
@ -43,7 +42,6 @@ void Emitter::destroy()
void Emitter::spawnParticle(float perc)
{
BBGE_PROF(Emitter_spawnParticle);
Particle *p = particleManager->getFreeParticle(this);
p->active = true;
@ -231,8 +229,6 @@ void Emitter::render()
void Emitter::onRender()
{
BBGE_PROF(Emitter_onRender);
if (particles.empty()) return;
if (!data.spawnLocal)

View file

@ -39,7 +39,6 @@ void ParticleEffect::setDie(bool v)
void ParticleEffect::load(const std::string &name)
{
BBGE_PROF(ParticleEffect_load);
particleManager->loadParticleEffectFromBank(name, this);
}
@ -443,8 +442,6 @@ void ParticleEffect::stop()
void ParticleEffect::onRender()
{
BBGE_PROF(ParticleEffect_onRender);
RenderObject::onRender();
}

View file

@ -257,7 +257,6 @@ Particle *ParticleManager::stomp()
Particle *ParticleManager::getFreeParticle(Emitter *emitter)
{
BBGE_PROF(ParticleManager_getFreeParticle);
if (size == 0) return 0;
Particle *p = 0;
@ -354,7 +353,6 @@ int ParticleManager::getSize()
void ParticleManager::update(float dt)
{
BBGE_PROF(ParticleManager_update);
numActive = 0;
for (size_t i = 0; i < particles.size(); i++)
{

View file

@ -1,617 +0,0 @@
IPROF: A Portable Industrial-Strength Interactive Profiler for C++ and C
by Sean Barrett
Version 0.2
CONTENTS
Overview
User Manual
Platform
Instrumentation
Private Zones
Public Zones
Initialization
Processing Data
Displaying Results
Controlling Display
Understanding CALL GRAPH output
Performance Expectation
Implementation Notes
Version History
OVERVIEW
IProf is an interactive profiler which works by intrusively instrumenting
code. Code is divided into zones by programmer-inserted statements. Zones
are both lexically and dynamically scoped--all time spent within a
lexically scoped zone, and any code which it calls which is not itself
zoned, is attributed to that zone.
Profiling occurs interactively; time is divided into "frames", and the
profiler shows time spent on the previous frame (or a smoothed average
or possibly even a frame a second or two ago).
Like a traditional profiler, IProf records or can compute the number of
times a zone is entered, the amount of time spent in the zone ("self
time"), and the amount of time spent in the zone and its descendents
("hierarchical time" -- "self + child time" in gprof).
Furthermore, IProf computes information along the lines of gprof--number of
times a given zone is entered from any other specific zone; self and
hierarchical time spent in a given zone on behalf of a specific parent
zone, etc. (However, where gprof estimates this information based only on
call counts, IProf measures the actual values. So, for example, IProf will
accurately report if a ray-casting routine called by both physics and AI
always spends longer per AI-call because the casts are longer.)
Precise information is available for recursive routines, including call
depths etc. [The current version of IProf does not yet completely handle
reporting of recursive data, although it is measured correctly.]
Additionally, IProf provides all numbers in instantaneous form or as two
differently weighted moving averages. It's easy to pause the profile
updating so that you can switch between multiple views of the paused data
set. Two optional flags allow trading off memory for deeper historical
views. The cheaper option provides only zone-self-time history, suitable
for a real-time graph of behavior. The more memory-expensive flag keeps a
history of all the data for a certain number of frames, allowing full
profile analysis of old frames.
IProf is designed for its monitoring/gathering mode to be "always on", even
in release/optimized builds. The monitoring routines are designed to be
reasonably efficient--the full hash on every function entry required by
gprof is avoided in most cases--and the programmer can minimize the impact
by limiting the instrumentation to relatively large routines. (One could
certainly instrument a vector add function and possibly get useful call
count data from it, but the monitoring overhead would be significant and
noticeable in that case.) In combination with history information, it
becomes possible to run an application, notice poor behavior, pause the
(always on) profiling and the application, and start browsing through the
historical profiling information.
IProf uses both per-call monitoring and a separate per-frame
gathering/analysis phase. The latter is itself instrumented so the overhead
due to it is easy to see.
USER MANUAL
These sections document the necessary code you must use and code changes
you must make to use the profiler.
The profiling system expects to be able to use any identifier which is
prefixed with "Prof_" with exactly that pattern of uppercase/lowercase
(i.e. "PROF_" and "prof_" can be used freely by other code).
COMPILING THE PROFILING SYSTEM
The profiler was developed using MSVC 6.0, but should be reasonably
portable. The implementation files are provided as .C files so they can be
used with C compilers; however, they can be renamed to C++ files and
compiled in that form. The implementations automatically insert extern "C"
on the public routines. Internal routines will use either C or C++ linkage
depending on which way you compile them; you must compile all the profiler
files as either C or C++, without intermixing.
[[ NOTE: Originally the code was written in C++, and then it was
converted to compile with C, and then some additional small changes
were made. As of this writing, I haven't actually tested compiling
everything as C++ again. Feel free to test for me. Or just compile
the C files as C--you can still USE the C++ interfaces fine.]]
Needed files:
prof_win32.c -- Win32 implementation of seconds-based timer
prof_gather.c -- raw data collection
prof_process.c -- high-level data collection, report generator
prof_draw.c -- opengl rendering interface
prof.h -- public front-end
prof_win32.h -- Win32 implementation of fast integer timestamp
prof_gather.h -- instrumentation macros (included by prof.h)
prof_internal.h -- private interfaces
PLATFORM SUPPORT
IProf requires a small amount--less than fifty lines--of platform-specific
code.
Win32 under MSVC is automatically supported with no further effort on your
part, using the files prof_win32.c and prof_win32.h
To use other platforms, just create equivalent files for your platform. The
C file contains a routine for getting an accurate floating point time
reading; the H file contains the definition of a 64-bit integer type and a
fast routine for reading a timestamp of that size. If 64-bit math isn't
available on your platform, or if your timestamp is only 32-bit, you can
replace the 64-bit type with a 32-bit type, as long as that item won't
overflow in the course of running the application. (A 31-bit millisecond
timer is good for 24 days, but is very imprecise for this application.) If
reading the timestamp is slow, you will want to minimize how often the zone
entry and exit points are called.
Also required is a display interface; an opengl one is provided, although
others would be easy to code. (The primary display is purely textual, and
is available through a text interface.)
INSTRUMENTATION
First, #include "prof.h" in files that need profiling.
The flag Prof_ENABLED determines whether the monitoring code is compiled or
not, to make it easy to turn off all profiling code for final shippable
builds. Additional flags controlling amount of history data and memory
usage therein are defined at the top of the file prof_process.c and should
just be changed there since they affect no other files.
There are two main ways of instrumenting, and each offers a C++ interface
and a C interface.
Private zones
C++ Prof(zone);
C Prof_Begin(zone)
Prof_End
Public zones
Prof_Define(zone);
Prof_Declare(zone);
C++ Prof_Scope(zone);
C Prof_Region(zone)
Prof_End
Zone names--indicated by "zone" above--must obey the rules for identifiers,
although they can begin with a number, and they exist in a separate
namespace from regular identifiers.
So these are valid zone names:
my_zone_2
2_my_zone
__
And these are NOT valid zone names:
"my_zone"
my_class::my_zone
PRIVATE ZONES
The simplest, and highly recommended, approach to instrumentation is to
create a private zone which only exists in a single location. In C++, you
do this by declaring a lexically scoped zone with a statement which behaves
semantically like a variable declaration:
// C++ instrumentation
void my_routine()
{
Prof(my_routine_name);
... my code ...
}
This will cause all time spent after Prof(my_routine_name) to accumulate in
a zone in the profiling reports labeled "my_routine_name". The zone ends
when the name goes out of scope, that is, when a destructor would be called
corresponding to this declaration.
Zones don't have to appear at routine-level function scope; for example:
// C++ instrumentation
void my_routine()
{
Prof(my_routine);
... // zone my_routine
if (...)
{
Prof(my_routine_special_case);
... // zone my_routine_special_case
}
... // zone my_routine
}
Instrumenting in C requires more work, because C doesn't provide
destructors, so it's not possible to lexically scope zones automatically.
Instead, the programmer must insert Begin/End pairs and make sure those
pairs are accurately balanced. All paths out of a function must be
accounted for. A crash or severe slowdown is likely to occur with
unbalanced pairs.
// C instrumentation
void my_routine(void)
{
Prof_Begin(my_routine)
int x = some_func();
if (x == 0) {
Prof_End
return;
}
...
Prof_End
}
Prof_Begin() is declaration-like; however, it takes no trailing semicolon.
(This is necessary so it can be compiled out; C doesn't allow the empty
statement ";" to precede variable declarations.) Prof_End takes no
trailing semicolon or parentheses to help remind you of this. (You can
change the definition of Prof_End in prof_gather.h if you don't like that.)
Profiling instructions like Prof() and Prof_Begin() can be placed anywhere
that variable declarations are legal; generally you want to define them
before other variables so the variable initializations are profiled.
The C interfaces are also available in C++ if you should want to use a not-
exactly-lexically-scoped zone, e.g. end a zone before the destructor would
go out of scope. (You can't, however, end Prof() with Prof_End.)
PUBLIC ZONES
If you define multiple private zones with the same name, they will be
treated as entirely unrelated zones that happen to have the same name, and
you will see the same name multiple times in the profiling output.
Instead, you probably want to use public zones, to use the same zone in
multiple regions of code. For example, we might have two routines that
serve the same purpose which we always want to measure as one. Or we might
have two blocks of code within a single routine which we want to credit to
the same zone.
To do this, first define the zone with Prof_Define(zone), and then use it
with Prof_Scope(zone) [C++] or Prof_Region(zone) ... Prof_End [C].
// C++ instrumentation
Prof_Define(my_routine);
void my_routine_v1()
{
Prof_Scope(my_routine);
...
}
void my_routine_v2()
{
Prof_Scope(my_routine);
...
}
or
// C instrumentation
Prof_Define(my_routine);
void my_routine_v1(void)
{
Prof_Region(my_routine)
...
Prof_End
}
void my_routine_v2(void)
{
Prof_Region(my_routine)
...
Prof_End
}
Because Prof_Define defines an actual global symbol (if used at file
scope), the symbol can even be referenced from other files by saying:
extern Prof_Declare(my_routine);
void my_routine()
{
Prof_Scope(my_routine);
}
You can use 'extern "C" Prof_Declare()' or Prof_Define() to share a zone
between C and C++ code.
USER MANUAL - INITIALIZATION
The profiling system is self-initializing.
USER MANUAL - PROCESSING DATA
Every frame, you should call Prof_update(). Prof_update() will gather
results and record frame-history information on the assumption that each
call is a frame. Prof_update() takes a boolean flag which indicates whether
to update the history or not; passing in false means profiling is "paused"
and doesn't change.
You might wire this to its own toggle, or you might simply pass in a pre-
existing flag for whether the simulation itself is active or not, thus
allowing you to pause the simulation and automatically pause the profiling.
(On the other hand, if you're profiling a renderer, you might want to
pause the simulation and keep profiling.)
USER MANUAL - DISPLAYING RESULTS
IProf offers two separate types of display: the report, which is primarily
textual, and the graph, which is entirely graphical.
If you're using OpenGL, output is straightforward. For the text report,
call Prof_draw_gl() with the display set to a 2d rendering mode--one that
can use integer addressing, e.g. integers the size of pixels, virtual
pixels (e.g. a 640x480 screen regardless of actual dimension), or even
characters. Set the blending state to whatever blending mode you want for
the report display. For the graphics report, call Prof_draw_graph_gl().
Details of the parameters to these functions are available in the header
file.
For other output devices (Direct3D, text), you'll have to write your own
functions equivalent to Prof_draw_gl() and Prof_draw_graph_gl(). These
should not be too difficult; these functions don't compute any of the
profiling information; they simply format a text report or dataset to the
screen. The text report format consists of several title fields to be
printed, and then a collection of data records. Each data record has a name
and an indentation amount for that name (used for call graph
parent/children formatting), a collection of unnamed data "values", and a
flag field indicating which of the data values should be displayed.
Additionally, data records have a "heat" which indicates how rapidly
changing they are, and one record may be "highlighted" indicating a virtual
cursor is on that line.
[[ In practice, Prof_draw_gl makes few enough GL calls that maybe it's
worth modularizing things out further. ]]
USER MANUAL - CONTROLLING DISPLAY
IProf features some easy-to-use UI elements that allow program-direct
control or user-interaction-based control over what data is reported.
Simply hook these calls up to hotkey presses to complete your working
profile system. (You could even write code to support mouse clicking on the
report by calling Prof_set_cursor and on the graph by calling
Prof_set_frame, but the hit detection is up to you.)
These are in rough order of the priority with which you might want to
implement them.
Most important
Prof_set_report_mode(enum ...)
Selects what to show in the report:
Prof_SELF_TIME: flat times sorted by self time
Prof_HIERARCHICAL_TIME: flat times sorted by hierarchical time
Prof_CALL_GRAPH: call graph parent/children information
Prof_move_cursor(int delta)
Move the cursor up-or-down by delta lines
Prof_select(void)
Switch to call graph mode on whichever zone is currently selected
Prof_select_parent(void)
Go to largest-hierarchical-time parent of the active zone in
the call graph. (Roughly like "go up a directory".)
Important if you support history
Prof_move_frame(int delta)
Move backwards or forwards in history by delta frames
Not too important
Prof_set_smoothing(int smoothing_mode)
Selects which moving average to use (0 == instantaneous, 1=default);
only meaningful if frame# = 0; when looking at history, instantaneous
values are always used.
Prof_set_frame(int frame)
Selects which history entry to view (0==current, 1==previous, etc.)
Prof_set_cursor(int pos)
Set the position of the up-and-down cursor.
Prof_set_recursion(enum ...)
Selects whether to show recursive routines as a single zone or
as a series of distinct zones for each recursion level.
[[ currently unimplemented ]]
UNDERSTANDING CALL GRAPH OUTPUT
The call graph output focuses on a single zone, and provides information
about the parents (callers) and children (callees) of that zone.
The general format is something like this:
zone self hier count
+my_parent1 0.75 2.50 4.0
+my_parent2 1.00 3.25 6.0
-my_routine 1.75 5.75 10.0
+my_child1 1.00 2.00 15.0
+my_child2 0.25 1.50 500.0
my_child3 0.50 0.50 3.0
"self" indicates self-time (time in this zone), "hier" is hierarchical-time
(time in this zone or its descendents), and "count" is the number of times
the zone was entered. (Entry counts are inherently integral, but are shown
as floating point since they may be a moving average of several integers.)
Currently the zone "my_routine" is being examined. It accounts for 5.75
milliseconds of time between itself and the zones it calls. 1.75ms are
spent in itself. The zone was entered (called) 10 times this frame.
The difference between my_routine's self time and hierarchical time is
4.00ms; that much time must be being spent in its descendents. Its
immediate children--the zones that my_routine calls directly--appear below
it on the table. The hierarchical time of each child represents the time
spent in that child and all its descendents *on behalf of my_routine*--
other calls to that child are not counted. Thus, the sum of all the
children's hierarchical time should account for all time spent in
descendents of my_routine; hence, the sum of the child hier times is 4.00,
identical to the difference between self and hier for my_routine.
Above "my_routine" in the chart is information about the callers of
my_routine. However, the timings and counts in this section are not the
self and hierarchical times of the parent functions themselves--there is no
sensible meaning of "on behalf of my_routine" for the parents. Instead, the
self, hier, and count fields show the time spent *in my_routine* on behalf
of those parents. Thus, for each field, all of the parent entries sum to
the corresponding entry in my_routine. Again, these are computed exactly.
If my_routine was the public interface to a raycaster called by both AI and
physics, but it passed the raycast on to further routines which were
themselves explicitly zoned, then most of the my_routine time would be
spent in descendents. This would show up in the "hierarchical time" field,
and the parent zones, AI and physics, would show that hierarchical time
attributed accurately.
There is additional data available in the system--it would be possible to
drill down into lower-level functions and still attribute them to zones
several parent levels above; there just isn't currently any user interface
or computation functionality to do it.
PERFORMANCE EXPECTATION
Except for recursive routines (see Implementation Notes section), the
expected performance on zone entry comes from running roughly the following
code:
extern Something *p0,*p1;
if (p0->ptr_field != p1) { ... /* rarely runs */ }
p0->int64_field0 = RDTSC; // read timestamp counter
p0->int32_field += 1;
p1->int64_field1 += p0->int64_field0 - p1->int64_field0;
p1 = p0;
Zone exit costs a bit less.
IMPLEMENTATION NOTES
IProf uses two relatively unknown techniques to produce accurate call
information with minimal overhead. The first technique produces accurate
call information at a similar cost to gprof's mcount monitoring; the second
reduces the overhead.
_Zone Stack Tracking_
gprof's mcount technique combines two separate measurements. At every
function entry, the function and the caller (grabbed from the return
address on the stack) are hashed to determine a unique "data-gathering
slot", and an integer in that slot is incremented. Thus, exact pairwise
call counts are computed. Simultaneously, gprof periodically samples the
instruction pointer to measure the time spent in any given routine--"self
times". Hierarchical times are computed by distributing the self times up
the tree based on the call graph counts. (If routine X is called 9 times
from routine Z, and one time from routine Y, then 90% of X's time is
attributed to Z, and 10% to Y.)
An intrusive profiler which samples a timer at zone entry and again at zone
exit will compute accurate hierarchical times. By keeping a stack of zones,
it's possible to compute accurate hierarchical and self times. The stack of
zones also provides caller information, so hierarchical and self times can
be attributed to each unique pair of caller & callee zones (via hashing).
This will allow much more accurate attribution. In fact, it is sufficient
to compute exact values for all the information gprof outputs, except in
the face of recursion. Performance is fairly good; unlike a single-zone
intrusive profiler, which must measure both self and hierarchical time,
since neither can be derived from the other, the zone-pair profiler need
only measure hierarchical time; self-time can be derived from hierarchical
time (but not vice versa).
A further improvement is, instead of having one data-gathering slot per
zone--that is, representing the state of the top of the zone stack--and
instead of having one data-gathering slot per caller/callee zone pair--that
is, representing the state of the top two entries of the zone stack--to
have one data-gathering slot per unique full stack state. This can be done
straightforwardly by building the stack as a linked list (creating an
inverted tree--a tree of all stack states with only parent-pointer links),
and hashing the "zone to be pushed" and the current stack to find the new
stack. Thus the cost of the hash computation is essentially identical to
the previous case. If every zone is only called from one specific place,
there will still only be one data-gathering slot per zone; if a routine is
recursive, it will create a large number of data-gathering slots, one for
each depth of recursion. (A complex mutually recursive program like a
compiler might generate an unreasonable number of unique states.)
With zone-stack tracking, it's possible to measure only either hierarchical
time or self-time and derive the other. Hierarchical time is actually more
efficient to measure, but it leaves handling the top-level overarching
global state as a special case (since it will have a timer that starts but
never ends). It's easier to instead measure self-time and rederive
hierarchical time. Moreover, a recursive routine will automatically
"overcount" hierarchical time (it's accrued at each level of the hierarchy),
requiring significant fixup. It's more straightforward to just compute the
recursive data correctly from the self times in the first place.
_Hash Caching_
Although the hash lookup described above is coded to proceed as quickly as
possible if the hash hits on the first probe, it still requires enough
computation and a function call that it is worth avoiding if possible. To
that end, each zone-entry location declares a hidden static variable
private to that zone-entry point which caches the hash lookup. At zone-
entry, the code checks the cache's "next node in the linked list" field
with the current stack state. If the two are equal, then the cache is
valid, and no hash lookup occurs. If it does not match, then the cache is
wrong, and the hash lookup proceeds, and updates the cache. The cache is
initialized to an impossible value, so the first time the code is run a hash
lookup always occurs.
The result is that in the normal case, a routine called from a single
place, the cache is always valid (after the very first call). Furthermore,
the branch will always predict correctly, since it always branches
identically. However, for a routine that is called from several places,
there is a "switching" overhead each time it's called from a different
place. So, for example, a raycaster called by both physics and AI might pay
the overhead twice per frame, if all the AI calls occur before all the
physics calls. However, a common low-level routine (e.g. a vector add)
called alternately from two different zones would have to perform the hash
lookup every time.
The actual common "failure" case is a recursive routine, for which, each
time the routine is entered, the state of the call stack is different from
the last time, thus almost always paying the hash lookup case. For
something like a recursive linked list traversal, the hash occurs every
time. (It doesn't matter if the routine is tail-recursive; once you insert
the profiling instrumentation, it's no longer tail-recursive.) A full
binary tree traversal will always enter a different zone-stack-state from
last time, except after reaching a left-child leaf. (The recursion then
returns and then goes down to the right child, which is at the same height
as the left child.) So a full binary tree traversal will have to hash about
3/4 of the time. A full quadtree traversal will have to hash about 2/5 of
the time. If the traversal is doing anything complicated, this should not
be a problem; but if it's a simple traversal, the performance overhead may
be significant. Like the vector add case, it may be better to remove
instrumentation from low-inherent-cost recursive routines except when
absolutely needed. Of course, it's easy enough to compare performance
behavior before and after adding the instrumenting and see if the overhead
is acceptable.
VERSION HISTORY
version 0.2 -- 2003-02-06 STB
- Significant interface changes to Prof_draw_gl:
- accepts floating point instead of int for 2d screen metrics
- accepts a total width and height of the display and conforms
to that
- accepts a precision specification for display of time values
- added little '+' and '-' signs reminiscent of list displays
so you know which ones can be drilled down on
- expanded this doc's description of what's legal for a zone-name
- fixed an error trying to compile the C files as C++
- added Prof_select_parent() for moving up the tree
version 0.1 -- 2003-02-05 STB
- First public version, heavily refactored, 1500 lines
- win32 timing interface and smooth "moving average" code derived
from Jonathan Blow's Game Developer Magazine articles
- missing functionality:
- correct attribution of time to zones that are parents of
recursive zones in call graph view (hierarchical times don't
bubble up correctly)
- spread recursion display (displaying each depth of a recursive
zone as if it were a separate zone)

View file

@ -1,94 +0,0 @@
#ifndef Prof_INC_PROF_H
#define Prof_INC_PROF_H
//#define Prof_ENABLED
#include "prof_gather.h"
#ifdef __cplusplus
extern "C" {
#endif
/*
* Prof_update
*
* Call once per frame: results are gathered and frame-history
* information is recorded on the assumption that each call is a frame.
*
* Pass in true (1) to accumulate history info; pass
* in false (0) to throw away the current frame's data
* (profiling is then "paused" and the history does not change).
*/
extern void Prof_update(int record);
/*
* Prof_draw_gl -- display the current (primarily textual) report via OpenGL
*
* You must provide a callable text-printing function.
* Put the opengl state into a 2d rendering mode, and set whatever
* blending mode you want for the report, before calling.
*
* Parameters:
* <sx,sy> -- location where top line is drawn
* <width, height> -- total size of display (if too small, text will overprint)
* line_spacing -- how much to move sy by after each line; use a
* negative value if y decreases down the screen
* precision -- decimal places of precision for time data, 1..4 (try 2)
* print_text -- function to display a line of text starting at the
* given coordinate; best if 0,1..9 are fixed-width
* text_width -- a function that computes the pixel-width of
* a given string before printing. you can fake with a
* simple approximation of width('0')*strlen(str)
*
* to avoid overprinting, you can make print_text truncate long strings
*
* Both callbacks receive a non-const char *.
* NOTE(review): presumably they are not expected to modify the string -- confirm.
*/
extern void Prof_draw_gl(float sx, float sy,
float width, float height,
float line_spacing,
int precision,
void (*print_text)(float x, float y, char *str),
float (*text_width)(char *str));
/*
* Prof_draw_graph_gl -- display the graphical report via OpenGL
*
* Parameters
* <sx, sy> -- origin of the graph--location of (0,0)
* x_spacing -- screenspace size of each history sample; e.g.
* 2.0 pixels
* y_spacing -- screenspace size of one millisecond of time;
* for an app with max of 20ms in any one zone,
* 8.0 would produce a 160-pixel tall display,
* assuming screenspace is in pixels
*/
extern void Prof_draw_graph_gl(float sx, float sy,
float x_spacing, float y_spacing);
/*
 * Selects which quantity the textual report is organized around
 * (see Prof_set_report_mode).
 */
typedef enum
{
// flat list of zones; report title reads "sort self"
Prof_SELF_TIME,
// flat list of zones; report title reads "sort hier"
Prof_HIERARCHICAL_TIME,
// parents / selected zone / children view of one zone
Prof_CALL_GRAPH,
} Prof_Report_Mode;
extern void Prof_set_report_mode(Prof_Report_Mode e);
extern void Prof_move_cursor(int delta);
extern void Prof_select(void);
extern void Prof_select_parent(void);
extern void Prof_move_frame(int delta);
extern void Prof_set_smoothing(int smoothing_mode);
extern void Prof_set_frame(int frame);
extern void Prof_set_cursor(int line);
typedef enum
{
Prof_FLATTEN_RECURSION,
Prof_SPREAD_RECURSION
} Prof_Recursion_Mode;
extern void Prof_set_recursion(Prof_Recursion_Mode e);
#ifdef __cplusplus
}
#endif
#endif // Prof_INC_PROF_H

View file

@ -1,334 +0,0 @@
#ifdef WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include <gl/gl.h>
#include <stdio.h>
#include <stdlib.h>
#include "prof.h"
#include "prof_internal.h"
#pragma warning(disable:4305; disable:4244)
// use factor to compute a glow amount
// Map a "heat" factor onto display colors.
//
//   factor          -- heat value; clamped into [0,1] before use
//   text_color_ret  -- receives the RGB text color, blended cold -> hot
//   glow_color_ret  -- receives the RGB glow color (only written when
//                      the function returns 1)
//   glow_alpha_ret  -- receives the glow opacity (0 when there is no glow)
//
// Returns 1 if a glow should be drawn (factor is at least GLOW_RANGE),
// otherwise 0.
static int get_colors(float factor,
                      float text_color_ret[3],
                      float glow_color_ret[3],
                      float *glow_alpha_ret)
{
    const float GLOW_RANGE = 0.5f;
    const float GLOW_ALPHA_MAX = 0.5f;
    const float text_cold[3] = {0.15f, 0.9f, 0.15f};
    const float text_hot[3]  = {1.0f, 1.0f, 0.9f};
    const float glow_cold[3] = {0.5f, 0.5f, 0.0f};
    const float glow_hot[3]  = {1.0f, 1.0f, 0.0f};
    float glow_strength;
    int c;

    // clamp the incoming factor
    if (factor < 0) factor = 0;
    if (factor > 1) factor = 1;

    for (c = 0; c < 3; ++c)
        text_color_ret[c] = text_cold[c] + (text_hot[c] - text_cold[c]) * factor;

    // The glow only kicks in once the factor passes GLOW_RANGE.
    glow_strength = (factor - GLOW_RANGE) / (1 - GLOW_RANGE);
    if (glow_strength < 0) {
        *glow_alpha_ret = 0;
        return 0;
    }

    for (c = 0; c < 3; ++c)
        glow_color_ret[c] = glow_cold[c] + (glow_hot[c] - glow_cold[c]) * factor;

    *glow_alpha_ret = glow_strength * GLOW_ALPHA_MAX;
    return 1;
}
// Emit the four corners of the axis-aligned rectangle (x0,y0)-(x1,y1)
// as vertices. Must be called between glBegin/glEnd by the caller.
static void draw_rectangle(float x0, float y0, float x1, float y1)
{
    // Corner order: (x0,y0) (x1,y0) (x1,y1) (x0,y1).
    // FACE_CULL is disabled so winding doesn't matter
    const float corner_x[4] = { x0, x1, x1, x0 };
    const float corner_y[4] = { y0, y0, y1, y1 };
    int k;

    for (k = 0; k < 4; ++k)
        glVertex2f(corner_x[k], corner_y[k]);
}
// Screen-space mapping handed to graph_func through its 'data' pointer.
typedef struct
{
// screen position of the graph origin
float x0,y0;
// scale factors: screen units per sample index (sx) and per unit of value (sy)
float sx,sy;
} GraphLocation;
// Prof_graph callback: draws one run of history samples as a GL line strip.
//
//   id      -- 0 selects translucent white; any other value is scrambled
//              into a per-id RGB color
//   x0, x1  -- half-open range [x0,x1) of sample positions; values[0]
//              belongs to position x0
//   values  -- the samples to plot
//   data    -- pointer to a GraphLocation giving origin and scale
static void graph_func(int id, int x0, int x1, float *values, void *data)
{
GraphLocation *loc = (GraphLocation *) data;
int i, r,g,b;
// trim out values that are under 0.2 ms to accelerate rendering
// (leading samples first; 'values' advances together with x0 ...
while (x0 < x1 && (*values < 0.0002f)) { ++x0; ++values; }
// ... then trailing samples, indexed relative to the advanced pointer)
while (x1 > x0 && (values[x1-1-x0] < 0.0002f)) --x1;
if (id == 0)
glColor4f(1,1,1,0.5);
else {
// nothing left to draw for this id after trimming
if (x0 == x1) return;
// cheap scramble of the id into three color channels
id = (id >> 8) + id;
r = id * 37;
g = id * 59;
b = id * 45;
#pragma warning(disable:4761)
// keep every channel within 80..207
glColor3ub((r & 127) + 80, (g & 127) + 80, (b & 127) + 80);
}
glBegin(GL_LINE_STRIP);
// empty range (only reachable with id == 0): draw a single vertical
// line from the baseline up to values[0]
if (x0 == x1) {
float x,y;
x = loc->x0 + x0 * loc->sx;
y = loc->y0 + values[0] * loc->sy;
glVertex2f(x,loc->y0);
glVertex2f(x, y);
}
for (i=0; i < x1-x0; ++i) {
float x,y;
x = loc->x0 + (i+x0) * loc->sx;
y = loc->y0 + values[i] * loc->sy;
glVertex2f(x,y);
}
glEnd();
}
// Draw the per-zone history graph with OpenGL; compiles to an empty
// function unless Prof_ENABLED is defined.
//   sx, sy     -- screen position of the graph origin
//   x_spacing  -- screen units per history sample
//   y_spacing  -- screen units per millisecond (multiplied by 1000 below
//                 before being applied to the sample values)
Prof_extern_C void Prof_draw_graph_gl(float sx, float sy, float x_spacing, float y_spacing)
{
#ifdef Prof_ENABLED
Prof_Begin(iprof_draw_graph)
GraphLocation loc = { sx, sy, x_spacing, y_spacing * 1000 };
// request 128 frames of history
Prof_graph(128, graph_func, &loc);
Prof_End
#endif
}
// float to string conversion with sprintf() was
// taking up 10-20% of the Prof_draw time, so I
// wrote a faster float-to-string converter
// Lookup tables for the fast paths of float_to_string, indexed 0..99:
//   int_to_string[i]             "0" .. "99"
//   int_to_string_decimal[i]     ".00" .. ".99"
//   int_to_string_mid_decimal[i] "0.0" .. "9.9"
// Every entry needs at most 3 characters plus the terminator.
static char int_to_string[100][4];
static char int_to_string_decimal[100][4];
static char int_to_string_mid_decimal[100][4];

// Fill the lookup tables. Must run once before float_to_string is used
// with precision 2..4.
static void int_to_string_init(void)
{
    int i;
    for (i = 0; i < 100; ++i) {
        sprintf(int_to_string[i], "%d", i);
        sprintf(int_to_string_decimal[i], ".%02d", i);
        sprintf(int_to_string_mid_decimal[i], "%d.%d", i / 10, i % 10);
    }
}

// printf formats for the slow path, indexed by precision 0..4
static const char *formats[5] =
{
    "%.0f",
    "%.1f",
    "%.2f",
    "%.3f",
    "%.4f",
};

// Write 'num' into 'buf' with 'precision' decimal places.
//
//   buf        -- destination; must be large enough for the formatted text
//   num        -- value to format
//   precision  -- decimal places; clamped into 0..4
//
// Small non-negative values take a table-driven fast path (digits are
// truncated, not rounded); everything else -- including negative values
// and NaN -- falls back to sprintf().
static void float_to_string(char *buf, float num, int precision)
{
    int x, y;

    // 'precision' indexes formats[], so keep it inside the table
    if (precision < 0) precision = 0;
    if (precision > 4) precision = 4;

    // The range checks are written as !(lo <= num && num < hi) so that NaN
    // fails them and goes to sprintf() instead of being cast to int and
    // used as a table index.
    switch (precision) {
    case 2:
        if (!(num >= 0 && num < 100))
            break;
        x = num;
        y = (num - x) * 100;
        strcpy(buf, int_to_string[x]);
        strcat(buf, int_to_string_decimal[y]);
        return;
    case 3:
        if (!(num >= 0 && num < 10))
            break;
        num *= 10;
        x = num;
        y = (num - x) * 100;
        strcpy(buf, int_to_string_mid_decimal[x]);
        // +1 skips the leading '.' of the decimal table entry
        strcat(buf, int_to_string_decimal[y] + 1);
        return;
    case 4:
        if (!(num >= 0 && num < 1))
            break;
        num *= 100;
        x = num;
        y = (num - x) * 100;
        buf[0] = '0';
        strcpy(buf + 1, int_to_string_decimal[x]);
        strcat(buf, int_to_string_decimal[y] + 1);
        return;
    }
    sprintf(buf, formats[precision], num);
}
// Render the current textual report with OpenGL; compiles to an empty
// function unless Prof_ENABLED is defined.
//
//   sx, sy        -- position of the top line
//   full_width    -- total width of the display
//   height        -- total height available; limits how many records are shown
//   line_spacing  -- added to sy after each line (negative if y decreases
//                    down the screen)
//   precision     -- decimal places for time values, clamped to 1..4
//   printText     -- draws a string at (x, y)
//   textWidth     -- returns the width of a string
//
// The magnitude of line_spacing is computed in floating point: calling the
// integer abs() on it would truncate fractional spacings and divide by
// zero for |line_spacing| < 1.
Prof_extern_C void Prof_draw_gl(float sx, float sy,
                                float full_width, float height,
                                float line_spacing, int precision,
                                void (*printText)(float x, float y, char *str), float (*textWidth)(char *str))
{
#ifdef Prof_ENABLED
    Prof_Begin(iprof_draw)
    int i,j,n,o;
    GLuint cull, texture;
    float backup_sy;
    float field_width = textWidth("5555.55");
    float name_width = full_width - field_width * 3;
    float plus_width = textWidth("+");
    float spacing_mag = (line_spacing < 0) ? -line_spacing : line_spacing;
    int max_records;
    Prof_Report *pob;

    // build the number lookup tables on first use
    if (!int_to_string[0][0]) int_to_string_init();

    if (precision < 1) precision = 1;
    if (precision > 4) precision = 4;

    // disable face culling to avoid having to get winding correct
    texture = glIsEnabled(GL_TEXTURE_2D);
    cull = glIsEnabled(GL_CULL_FACE);
    if (cull == GL_TRUE) {
        glDisable(GL_CULL_FACE);
    }

    pob = Prof_create_report();

    // Title lines: a colored bar with the title text on top; each one
    // consumes 1.5 lines of the available height.
    for (i=0; i < NUM_TITLE; ++i) {
        if (pob->title[i]) {
            float header_x0 = sx;
            float header_x1 = header_x0 + full_width;
            if (i == 0)
                glColor4f(0.1f, 0.3f, 0, 0.85);
            else
                glColor4f(0.2f, 0.1f, 0.1f, 0.85);
            glBegin(GL_QUADS);
            draw_rectangle(header_x0, sy-2, header_x1, sy-line_spacing+2);
            glEnd();
            if (i == 0)
                glColor4f(0.6, 0.4, 0, 0);
            else
                glColor4f(0.8f, 0.1f, 0.1f, 0);
            printText(sx+2, sy, pob->title[i]);
            sy += 1.5*line_spacing;
            height -= spacing_mag*1.5;
        }
    }

    // How many record lines fit in what is left of the height.
    max_records = (spacing_mag > 0) ? (int) (height / spacing_mag) : 0;
    if (max_records < 0) max_records = 0;

    // 'o' is the index of the first record shown; scroll just far enough
    // that the highlighted record stays visible.
    o = 0;
    n = pob->num_record;
    if (n > max_records) n = max_records;
    if (pob->hilight >= o + n) {
        o = pob->hilight - n + 1;
    }

    backup_sy = sy;

    // Draw the background colors for the zone data.
    glDisable(GL_TEXTURE_2D);
    glBegin(GL_QUADS);
    glColor4f(0,0,0,0.85);
    draw_rectangle(sx, sy, sx + full_width, sy - line_spacing);
    sy += line_spacing;
    for (i = 0; i < n; i++) {
        float y0, y1;
        // alternate row shading, with a distinct color for the cursor row
        if (i & 1) {
            glColor4f(0.1, 0.1f, 0.2, 0.85);
        } else {
            glColor4f(0.1f, 0.1f, 0.3, 0.85);
        }
        if (i+o == pob->hilight)
            glColor4f(0.3f, 0.3f, 0.1f, 0.85);
        y0 = sy;
        y1 = sy - line_spacing;
        draw_rectangle(sx, y0, sx + full_width, y1);
        sy += line_spacing;
    }
    glEnd();

    // Second pass over the same rows: the text.
    sy = backup_sy;
    glColor4f(0.7,0.7,0.7,0);
    if (pob->header[0])
        printText(sx+8, sy, pob->header[0]);
    // remaining headers are centered over their value columns
    for (j=1; j < NUM_HEADER; ++j)
        if (pob->header[j])
            printText(sx + name_width + field_width * (j-1) +
                      field_width/2 - textWidth(pob->header[j])/2, sy, pob->header[j]);
    sy += line_spacing;

    for (i = 0; i < n; i++) {
        char buf[256], *b = buf;
        Prof_Report_Record *r = &pob->record[i+o];
        float text_color[3], glow_color[3];
        float glow_alpha;
        float x = sx + textWidth(" ") * r->indent + plus_width/2;
        // the prefix character, when present, takes the place of the
        // extra indentation
        if (r->prefix) {
            buf[0] = r->prefix;
            ++b;
        } else {
            x += plus_width;
        }
        if (r->number)
            sprintf(b, "%s (%d)", r->name, r->number);
        else
            sprintf(b, "%s", r->name);
        if (get_colors(r->heat, text_color, glow_color, &glow_alpha)) {
            glColor4f(glow_color[0], glow_color[1], glow_color[2], glow_alpha);
            //printText(x+2, sy-1, buf);
            printText(x+1, sy, buf);
        }
        glColor3fv(text_color);
        printText(x + 1, sy, buf);

        // value columns, right-aligned within their field
        for (j=0; j < NUM_VALUES; ++j) {
            if (r->value_flag & (1 << j)) {
                int pad;
                // column 2 is always printed with 2 decimals
                float_to_string(buf, r->values[j], j == 2 ? 2 : precision);
                pad = field_width- plus_width - textWidth(buf);
                if (r->indent) pad += plus_width;
                printText(sx + pad + name_width + field_width * j, sy, buf);
            }
        }
        sy += line_spacing;
    }

    Prof_free_report(pob);

    // restore the GL state that was changed above
    if (cull == GL_TRUE)
        glEnable(GL_CULL_FACE);
    if (texture == GL_TRUE)
        glEnable(GL_TEXTURE_2D);
    Prof_End
#endif
}

View file

@ -1,166 +0,0 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "prof.h"
#include "prof_internal.h"
Prof_Define(_global);
Prof_Zone_Stack Prof_dummy ; // impossible parent
Prof_Zone_Stack Prof_dummy2 ;
Prof_Zone_Stack *Prof_stack = &Prof_dummy2;
int Prof_num_zones;
Prof_Zone *Prof_zones[];
#define MAX_HASH_SIZE 65536 // not unlimited, to catch unbalanced BEGIN/END_PROF
#define INIT_HASH_SIZE 256 // balance resource usage and avoid initial growth
static Prof_Zone_Stack *init_hash[] = { &Prof_dummy };
static Prof_Zone_Stack **zone_hash = init_hash;
static int zone_hash_count = 1;
static int zone_hash_max = 1;
static int zone_hash_mask = 0;
// Hash a (zone, parent stack) pair into an int for the zone-stack table.
// Callers mask the result with zone_hash_mask.
//
// The pointers are converted through size_t and mixed as unsigned values:
// casting a pointer directly to int truncates on 64-bit targets (and is
// rejected by C++ compilers), and unsigned arithmetic keeps the add and
// shift well defined.
static int hash(Prof_Zone *z, Prof_Zone_Stack *s)
{
    size_t bits = (size_t) z + (size_t) s;
    unsigned int n = (unsigned int) bits;
    return (int) (n + (n >> 8));
}
// Place an existing stack node into the current hash table, probing from
// its primary hash position with an odd step until a free slot (one that
// still holds &Prof_dummy) is found. Does not check for duplicates.
static void insert_node(Prof_Zone_Stack *q)
{
    const int h = hash(q->zone, q->parent);
    const int step = ((h << 4) + (h >> 4)) | 1;
    int pos;

    for (pos = h & zone_hash_mask;
         zone_hash[pos] != &Prof_dummy;
         pos = (pos + step) & zone_hash_mask) {
        /* occupied slot, keep probing */
    }

    zone_hash[pos] = q;
    zone_hash_count += 1;
}
// Register a zone: append it to Prof_zones and mark it initialized.
// NOTE(review): no bounds check against the capacity of Prof_zones is
// done here -- confirm the zone limit is enforced elsewhere.
static void init_zone(Prof_Zone *zone)
{
    const int index = Prof_num_zones;

    Prof_zones[index] = zone;
    Prof_num_zones = index + 1;
    zone->initialized = 1;
}
// Count how many nodes on the parent chain starting at 'stack' belong to
// 'zone'.
static int count_recursion_depth(Prof_Zone_Stack *stack, Prof_Zone *zone)
{
    Prof_Zone_Stack *node;
    int depth = 0;

    for (node = stack; node != NULL; node = node->parent) {
        if (node->zone == zone)
            depth++;
    }
    return depth;
}
// Allocate a zone-stack node for 'zone' entered beneath 'parent', with
// all counters zeroed and its recursion depth precomputed.
// NOTE(review): the malloc() result is not checked.
static Prof_Zone_Stack *createStackNode(Prof_Zone *zone, Prof_Zone_Stack *parent)
{
    Prof_Zone_Stack *node = (Prof_Zone_Stack *) malloc(sizeof(Prof_Zone_Stack));

    // identity of the node
    node->parent = parent;
    node->zone = zone;
    node->recursion_depth = count_recursion_depth(parent, zone);

    // timing state starts out empty
    node->t_self_start = 0;
    node->total_self_ticks = 0;
    node->total_hier_ticks = 0;
    node->total_entry_count = 0;

    // no high-level record attached yet
    node->highlevel = NULL;

    return node;
}
static void init_zone_hash(int size)
{
int i;
assert(size <= MAX_HASH_SIZE);
zone_hash_max = size;
zone_hash_count = 0;
zone_hash = (Prof_Zone_Stack **) malloc(sizeof(*zone_hash) * zone_hash_max);
zone_hash_mask = size-1;
for (i=0; i < zone_hash_max; ++i)
zone_hash[i] = &Prof_dummy;
}
static void Prof_init_lowlevel(void);
// this code is structured to minimize computation
// assuming there's a hit in the very first slot
// Return the zone-stack node for 'zone' entered beneath the current
// Prof_stack, creating and registering it on first use.
//
// The lookup is an open-addressing probe of zone_hash keyed on
// (zone, Prof_stack). On a miss the zone is initialized if needed, the
// table is doubled when it would become more than half full, and a new
// node is inserted.
Prof_extern_C Prof_Zone_Stack *Prof_StackAppend(Prof_Zone *zone)
{
    int h = hash(zone, Prof_stack), s;
    int x = h & zone_hash_mask;
    Prof_Zone_Stack *z = zone_hash[x];

    // fast path: hit in the very first slot
    if (z->parent == Prof_stack && z->zone == zone) return z;

    if (z != &Prof_dummy) {
        // compute a secondary hash function; force it to be odd
        // so it's relatively prime to the power-of-two table size
        s = ((h << 4) + (h >> 4)) | 1;
        for(;;) {
            x = (x + s) & zone_hash_mask;
            z = zone_hash[x];
            if (z->parent == Prof_stack && z->zone == zone) return z;
            if (z == &Prof_dummy) break;
        }
        // loop is guaranteed to terminate because the hash table is never full
    }

    // Not found; 'x' now indexes the empty slot where the probe ended.

    // now's as good a time as any to initialize this zone
    if (!zone->initialized) {
        if (zone_hash_max == 1) {
            Prof_init_lowlevel();
            // the above is reentrant since it initializes _global
            // so now invariants are broken, so start over
            return Prof_StackAppend(zone);
        }
        init_zone(zone);
    }

    // check if we need to grow the table
    // we keep it at most 1/2 full to be very fast
    if (zone_hash_count*2 > zone_hash_max) {
        Prof_Zone_Stack **old_hash = zone_hash, *fresh;
        int i,n = zone_hash_max;

        init_zone_hash(zone_hash_max*2);
        for (i=0; i < n; ++i)
            if (old_hash[i] != &Prof_dummy)
                insert_node(old_hash[i]);

        // The old array is no longer referenced once its nodes have been
        // re-inserted; release it unless it is the static bootstrap table.
        if (old_hash != init_hash)
            free(old_hash);

        fresh = createStackNode(zone, Prof_stack);
        insert_node(fresh);
        return fresh;
    }

    // insert new entry in hash table
    ++zone_hash_count;
    return zone_hash[x] = createStackNode(zone, Prof_stack);
}
// Invoke 'func' once for every zone-stack node currently stored in the
// hash table, in table order. Empty slots (&Prof_dummy) are skipped.
void Prof_traverse(void (*func)(Prof_Zone_Stack *z))
{
    int index = 0;

    while (index < zone_hash_max) {
        if (zone_hash[index] != &Prof_dummy)
            func(zone_hash[index]);
        ++index;
    }
}
// One-time setup, called from Prof_StackAppend while the table still has
// its bootstrap size of 1: allocates the real hash table, initializes the
// high-level state and enters the _global zone.
static void Prof_init_lowlevel(void)
{
init_zone_hash(INIT_HASH_SIZE);
Prof_init_highlevel();
// intentionally unbalanced, this wraps everything else
// (the region is entered here and never ended)
{
Prof_Region(_global)
}
}

View file

@ -1,152 +0,0 @@
#ifndef INC_PROFILER_LOWLEVEL_H
#define INC_PROFILER_LOWLEVEL_H
#ifdef __cplusplus
#define Prof_C "C"
#define Prof_extern_C extern "C"
#define Prof_dummy_declare
#else
#define Prof_C
#define Prof_extern_C
#define Prof_dummy_declare int Prof_dummy_dec =
#endif
#ifdef WIN32
#include "prof_win32.h"
#else
#error "need to define Prof_get_timestamp() and Prof_Int64"
#endif
// One profiling zone, i.e. one named instrumented region of code.
typedef struct
{
// zone name; Prof_Define initializes it with the stringized identifier
char * name;
// scratch pointer owned by the high-level code
void * highlevel;
// nonzero once the zone has been registered
char initialized;
// temporary marker used while walking parent chains
char visited;
char pad0,pad1;
} Prof_Zone;
// One node of the call tree: a zone entered beneath a particular parent
// node. Timing is accumulated per node.
typedef struct Prof_Zone_Stack
{
// timestamp at which this node's self timer was last started
Prof_Int64 t_self_start;
// ticks spent in this node itself
Prof_Int64 total_self_ticks;
// ticks including descendants
Prof_Int64 total_hier_ticks;
// number of times the node was entered
unsigned int total_entry_count;
// node this one was entered from
struct Prof_Zone_Stack * parent;
// zone this node belongs to
Prof_Zone * zone;
// recursion depth, fixed when the node is created
int recursion_depth;
// scratch pointer owned by the high-level code
void * highlevel;
} Prof_Zone_Stack;
extern Prof_C Prof_Zone_Stack * Prof_stack; // current Zone stack
extern Prof_C Prof_Zone_Stack Prof_dummy; // parent never matches
extern Prof_C Prof_Zone_Stack * Prof_StackAppend(Prof_Zone *zone);
// returns the zone stack created by pushing 'zone' on the current stack (Prof_stack)
#ifdef Prof_ENABLED
static Prof_Int64 Prof_time;
#define Prof_Begin_Cache(z) \
/* declare a static cache of the zone stack */ \
static Prof_Zone_Stack *Prof_cache = &Prof_dummy
#define Prof_Begin_Raw(z) \
Prof_Begin_Cache(z); \
Prof_Begin_Code(z)
#define Prof_Begin_Code(z) \
Prof_dummy_declare ( \
\
/* check the cached Zone_Stack and update if needed */ \
(Prof_cache->parent != Prof_stack \
? Prof_cache = Prof_StackAppend(&z) \
: 0), \
\
++Prof_cache->total_entry_count, \
Prof_get_timestamp(&Prof_time), \
\
/* stop the timer on the parent zone stack */ \
(Prof_stack->total_self_ticks += \
Prof_time - Prof_stack->t_self_start), \
\
/* make cached stack current */ \
Prof_stack = Prof_cache, \
\
/* start the timer on this stack */ \
Prof_stack->t_self_start = Prof_time, \
0)
#define Prof_End_Raw() \
\
(Prof_get_timestamp(&Prof_time), \
\
/* stop timer for current zone stack */ \
Prof_stack->total_self_ticks += \
Prof_time - Prof_stack->t_self_start, \
\
/* make parent chain current */ \
Prof_stack = Prof_stack->parent, \
\
/* start timer for parent zone stack */ \
Prof_stack->t_self_start = Prof_time)
#define Prof_Declare(z) Prof_Zone Prof_region_##z
#define Prof_Define(z) Prof_Declare(z) = { #z }
#define Prof_Region(z) Prof_Begin_Raw(Prof_region_##z);
#define Prof_End Prof_End_Raw();
#define Prof_Begin(z) static Prof_Define(z); Prof_Region(z)
#define Prof_Counter(z) Prof_Begin(z) Prof_End
#ifdef __cplusplus
#define Prof(x) static Prof_Define(x); Prof_Scope(x)
#define Prof_Scope(x) \
Prof_Begin_Cache(x); \
Prof_Scope_Var Prof_scope_var(Prof_region_ ## x, Prof_cache)
// Scope guard used by the Prof_Scope macro: the constructor begins the
// zone and the destructor ends it, so the zone lasts exactly as long as
// the enclosing C++ scope.
struct Prof_Scope_Var {
inline Prof_Scope_Var(Prof_Zone &zone, Prof_Zone_Stack * &Prof_cache);
inline ~Prof_Scope_Var();
};
// The parameter must be named Prof_cache because Prof_Begin_Code refers
// to that identifier.
inline Prof_Scope_Var::Prof_Scope_Var(Prof_Zone &zone, Prof_Zone_Stack * &Prof_cache) {
Prof_Begin_Code(zone);
}
inline Prof_Scope_Var::~Prof_Scope_Var() {
Prof_End_Raw();
}
#endif
#else // ifdef Prof_ENABLED
#ifdef __cplusplus
#define Prof(x)
#define Prof_Scope(x)
#endif
#define Prof_Define(name)
#define Prof_Begin(z)
#define Prof_End
#define Prof_Region(z)
#define Prof_Counter(z)
#endif
#endif // INC_PROFILER_LOWLEVEL_H

View file

@ -1,53 +0,0 @@
#ifndef Prof_INC_PROF_INTERNAL_H
#define Prof_INC_PROF_INTERNAL_H
// report functions
#define NUM_VALUES 4
#define NUM_TITLE 2
#define NUM_HEADER (NUM_VALUES+1)
// One line of the textual report.
typedef struct {
// indentation level of the name
int indent;
// zone name (not owned by the record)
char *name;
// shown in parentheses after the name when nonzero
int number;
// optional single character drawn before the name (0 = none)
char prefix;
// bit j set means values[j] is displayed
int value_flag;
double values[NUM_VALUES];
// factor used to pick the text/glow color
double heat;
// used internally
void *zone;
} Prof_Report_Record;
// A complete report as produced by Prof_create_report; release it with
// Prof_free_report.
typedef struct
{
// heap-allocated title lines, NULL when unused
char *title[NUM_TITLE];
// heap-allocated column headers, NULL when unused
char *header[NUM_HEADER];
// number of entries in 'record'
int num_record;
// index of the record under the cursor
int hilight;
Prof_Report_Record *record;
} Prof_Report;
extern void Prof_free_report(Prof_Report *z);
extern Prof_Report *Prof_create_report(void);
// really internal functions
extern void Prof_graph(int num_frames,
void (*callback)(int id, int x0, int x1, float *values, void *data),
void *data);
extern void Prof_init_highlevel();
extern double Prof_get_time(void);
extern int Prof_num_zones;
extern Prof_Zone *Prof_zones[];
extern Prof_Declare(_global);
#endif

View file

@ -1,774 +0,0 @@
#include <math.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "prof.h"
#include "prof_internal.h"
// whether zone-self-data is kept to allow the history graph
#define Prof_ZONE_HISTORY
// whether full, detailed (and large) per-call-stack history is kept
#define Prof_CALL_HISTORY
// number of frames of history to keep
#define NUM_FRAME_SLOTS 128
// number of unique zones allowed in the entire application
// @TODO: remove MAX_PROFILING_ZONES and make it dynamic
#define MAX_PROFILING_ZONES 512
////////////////////////////////////////////////////////////////////////
// the number of moving averages
#define NUM_PROFILE_TRACKER_HISTORY_SLOTS 3
// the number of frames to ignore before starting the moving averages
#define NUM_THROWAWAY_UPDATES 3
// threshhold for a moving average of an integer to be at zero
#define INT_ZERO_THRESHHOLD 0.25
Prof_Zone *Prof_zones[MAX_PROFILING_ZONES];
#ifdef Prof_ZONE_HISTORY
static float zone_history[MAX_PROFILING_ZONES][NUM_FRAME_SLOTS]; // 256K
#endif
// these structures are used solely to track data over time
// these structures are used solely to track data over time
// One tracked quantity: a moving average of the value and of its square
// per smoothing slot, plus (optionally) the raw value of every recent frame.
typedef struct
{
double values[NUM_PROFILE_TRACKER_HISTORY_SLOTS];
// moving averages of value*value (the mean square, not yet a variance)
double variances[NUM_PROFILE_TRACKER_HISTORY_SLOTS];
#ifdef Prof_CALL_HISTORY
// ring buffer of raw per-frame values, indexed by history_index
float history[NUM_FRAME_SLOTS];
#endif
} History_Scalar;
// Per-call-tree-node statistics, attached to Prof_Zone_Stack::highlevel
// by update_history.
typedef struct
{
History_Scalar self_time;
History_Scalar hierarchical_time;
History_Scalar entry_count;
// largest recursion depth seen for the node
int max_recursion;
} Profile_Tracker_Data_Record;
static History_Scalar frame_time;
static double times_to_reach_90_percent[NUM_PROFILE_TRACKER_HISTORY_SLOTS];
static double precomputed_factors [NUM_PROFILE_TRACKER_HISTORY_SLOTS];
static int num_active_zones;
static int update_index; // 2^31 at 100fps = 280 days
static double last_update_time;
static Prof_Report_Mode displayed_quantity;
#define FRAME_TIME_INITIAL 0.001
static int history_index;
static int display_frame;
static int slot = 1;
// Zero the moving averages of a History_Scalar. The per-frame history
// ring buffer is left untouched.
static void clear(History_Scalar *s)
{
    int k = 0;

    while (k < NUM_PROFILE_TRACKER_HISTORY_SLOTS) {
        s->variances[k] = 0;
        s->values[k] = 0;
        ++k;
    }
}
// Fold a new sample into every moving average of 's'.
//   new_value -- the sample for this frame
//   k_array   -- one retention factor per smoothing slot: the old average
//                is weighted by k and the new sample by (1 - k)
// With Prof_CALL_HISTORY the raw sample is also stored in the ring
// buffer at history_index.
static void update(History_Scalar *s, double new_value, double *k_array)
{
    const double squared = new_value * new_value;
    int n;

    for (n = 0; n < NUM_PROFILE_TRACKER_HISTORY_SLOTS; ++n) {
        const double keep = k_array[n];

        s->values[n]    = s->values[n] * keep + new_value * (1 - keep);
        s->variances[n] = s->variances[n] * keep + squared * (1 - keep);
    }

#ifdef Prof_CALL_HISTORY
    s->history[history_index] = (float) new_value;
#endif
}
// Overwrite every moving average of 's' with the new sample, as if the
// value had been constant forever. With Prof_CALL_HISTORY the raw sample
// is also stored in the ring buffer at history_index.
static void eternity_set(History_Scalar *s, double new_value)
{
    const double squared = new_value * new_value;
    int n;

    for (n = NUM_PROFILE_TRACKER_HISTORY_SLOTS - 1; n >= 0; --n) {
        s->variances[n] = squared;
        s->values[n] = new_value;
    }

#ifdef Prof_CALL_HISTORY
    s->history[history_index] = (float) new_value;
#endif
}
// Value of 's' to display: the raw sample recorded display_frame frames
// back when a historical frame is selected (Prof_CALL_HISTORY builds
// only), otherwise the moving average of the current smoothing slot.
static double get_value(History_Scalar *s)
{
#ifdef Prof_CALL_HISTORY
    if (display_frame != 0) {
        // step back in the ring buffer, wrapping around
        const int past = (history_index - display_frame + NUM_FRAME_SLOTS) % NUM_FRAME_SLOTS;
        return s->history[past];
    }
#endif
    return s->values[slot];
}
// Reset the high-level (reporting) state: update counters, the three
// smoothing time constants, the report mode and the frame-time tracker.
void Prof_init_highlevel()
{
    int k;

    update_index = 0;
    last_update_time = 0;
    displayed_quantity = Prof_SELF_TIME;

    // time constants of the three moving averages
    times_to_reach_90_percent[0] = 0.1f;
    times_to_reach_90_percent[1] = 0.8f;
    times_to_reach_90_percent[2] = 2.5f;

    // start the frame-time averages at a nonzero value
    clear(&frame_time);
    for (k = 0; k < NUM_PROFILE_TRACKER_HISTORY_SLOTS; ++k)
        frame_time.values[k] = FRAME_TIME_INITIAL;
}
#ifdef Prof_ENABLED
static Prof_Zone *expand = &Prof_region__global;
#else
static Prof_Zone *expand = NULL;
#endif
// Select which quantity subsequent reports are organized around.
Prof_extern_C void Prof_set_report_mode(Prof_Report_Mode desired)
{
displayed_quantity = desired;
}
// visit all Prof_Zone_Stacks
extern void Prof_traverse(void (*func)(Prof_Zone_Stack *c));
// Prof_traverse callback: add node c's self ticks to the hierarchical
// ticks of c and of its ancestors.
static void propogate_stack(Prof_Zone_Stack *c)
{
Prof_Zone_Stack *p = c;
// propagate times up the stack for hierarchical
// times, but watch out for recursion
// (the walk stops at the first node whose zone is NULL; the 'visited'
// flag ensures each zone on the chain is credited only once, at its
// innermost node)
while (p->zone) {
if (!p->zone->visited) {
p->total_hier_ticks += c->total_self_ticks;
p->zone->visited = 1;
}
p = p->parent;
}
// second walk: clear the flags again for the next node
p = c;
while (p->zone) {
p->zone->visited = 0;
p = p->parent;
}
}
// Prof_traverse callback: reset a node's per-frame counters.
// t_self_start is left alone.
static void clear_stack(Prof_Zone_Stack *c)
{
c->total_hier_ticks = 0;
c->total_self_ticks = 0;
c->total_entry_count = 0;
}
static double sum;
// Prof_traverse callback: accumulate a node's self ticks into the
// file-level 'sum'.
static void sum_times(Prof_Zone_Stack *c)
{
sum += c->total_self_ticks;
}
static double timestamps_to_seconds;
// Prof_traverse callback: fold this frame's counters of node 'c' into its
// long-term statistics record, creating the record on first use, and add
// the node's self time to its zone's history slot.
static void update_history(Prof_Zone_Stack *c)
{
    double self_time, hier_time, entry_count;
    Profile_Tracker_Data_Record *record = (Profile_Tracker_Data_Record *) c->highlevel;
    Prof_Zone *z = c->zone;

    if (record == NULL) {
        // calloc rather than malloc: the per-frame history arrays inside
        // the record are only written one slot per update, so with malloc
        // browsing older frames would read uninitialized memory.
        record = (Profile_Tracker_Data_Record *) calloc(1, sizeof(*record));
        if (record == NULL)
            return;   // out of memory: skip this node for this frame
        c->highlevel = (void *) record;
        clear(&record->entry_count);
        clear(&record->self_time);
        clear(&record->hierarchical_time);
        record->max_recursion = 0;
    }

    if (c->recursion_depth > record->max_recursion)
        record->max_recursion = c->recursion_depth;

    // convert tick counts to seconds
    self_time = c->total_self_ticks * timestamps_to_seconds;
    hier_time = c->total_hier_ticks * timestamps_to_seconds;
    entry_count = c->total_entry_count;

    // For the first few updates the averages are simply overwritten;
    // afterwards samples are blended in.
    if (update_index < NUM_THROWAWAY_UPDATES) {
        eternity_set(&record->entry_count, entry_count);
        eternity_set(&record->self_time, self_time);
        eternity_set(&record->hierarchical_time, hier_time);
    } else {
        update(&record->self_time, self_time, precomputed_factors);
        update(&record->hierarchical_time, hier_time, precomputed_factors);
        update(&record->entry_count, entry_count, precomputed_factors);
    }

#ifdef Prof_ZONE_HISTORY
    // z->highlevel is expected to point at this zone's float history slot
    // for the current frame
    * ((float *) z->highlevel) += (float) self_time;
#endif
}
const double SPEEDSTEP_DETECTION_RATIO = 0.08;
static int speedstep_warning;
// Per-frame bookkeeping; compiles to an empty function unless
// Prof_ENABLED is defined.
//   record_data -- nonzero: fold this frame's counters into the history;
//                  zero: discard them.
// Either way the per-node counters are cleared before returning.
Prof_extern_C void Prof_update(int record_data)
{
#ifdef Prof_ENABLED
Prof_Begin(iprof_update)
static History_Scalar integer_timestamps_per_second;
static Prof_Int64 last_integer_timestamp;
static Prof_Int64 current_integer_timestamp;
int i;
double now, dt;
Prof_Int64 timestamp_delta;
double timestamps_per_second;
assert(Prof_num_zones <= MAX_PROFILING_ZONES);
// compute hierarchical ticks from the self ticks
Prof_traverse(propogate_stack);
// Precompute the time factors
now = Prof_get_time();
if (update_index == 0) {
dt = FRAME_TIME_INITIAL;
} else {
dt = now - last_update_time;
if (dt == 0) dt = FRAME_TIME_INITIAL;
}
last_update_time = now;
// retention factor per smoothing slot for a frame of length dt
for (i = 0; i < NUM_PROFILE_TRACKER_HISTORY_SLOTS; i++) {
precomputed_factors[i] = pow(0.1f, dt / times_to_reach_90_percent[i]);
}
precomputed_factors[0] = 0; // instantaneous.
// estimate how many timestamp ticks elapse per second
Prof_get_timestamp(&current_integer_timestamp);
if (update_index == 0) {
// no previous timestamp yet: use the sum of all self ticks
sum = 0;
Prof_traverse(sum_times);
if (sum == 0) sum = 1;
timestamp_delta = (Prof_Int64) sum;
} else {
timestamp_delta = current_integer_timestamp - last_integer_timestamp;
if (timestamp_delta == 0) timestamp_delta = 1;
}
last_integer_timestamp = current_integer_timestamp;
timestamps_per_second = (double) timestamp_delta / dt;
if (update_index < NUM_THROWAWAY_UPDATES) {
eternity_set(&integer_timestamps_per_second, timestamps_per_second);
} else {
update(&integer_timestamps_per_second, timestamps_per_second, precomputed_factors);
}
// raise the warning flag when the tick rate varies too much relative
// to its mean
{
const int ss_slot = 1;
double ss_val, ss_variance, ss_stdev, ss_ratio;
ss_val = integer_timestamps_per_second.values[ss_slot];
ss_variance = integer_timestamps_per_second.variances[ss_slot] - ss_val*ss_val;
ss_stdev = sqrt(fabs(ss_variance));
// (the next statement has no effect)
ss_ratio;
if (ss_val) {
ss_ratio = ss_stdev / fabs(ss_val);
} else {
ss_ratio = 0;
}
speedstep_warning = (ss_ratio > SPEEDSTEP_DETECTION_RATIO);
}
if (!record_data) {
// caller asked to drop this frame: just reset the counters
Prof_traverse(clear_stack);
Prof_End
return;
}
if (timestamps_per_second) {
timestamps_to_seconds = 1.0 / timestamps_per_second;
} else {
timestamps_to_seconds = 0;
}
#ifdef Prof_ZONE_HISTORY
// point every zone at its (zeroed) history slot for this frame so that
// update_history can accumulate into it
for (i=0; i < Prof_num_zones; ++i) {
Prof_zones[i]->highlevel = (void *) &zone_history[i][history_index];
zone_history[i][history_index] = 0;
}
#endif
Prof_traverse(update_history);
update(&frame_time, dt, precomputed_factors);
++update_index;
history_index = (history_index + 1) % NUM_FRAME_SLOTS;
Prof_traverse(clear_stack);
Prof_End
#endif // Prof_ENABLED
}
static Prof_Report *allocate_buffer(int n)
{
int i;
Prof_Report *pob = (Prof_Report *) malloc(sizeof(*pob));
pob->num_record = n;
pob->record = (Prof_Report_Record *) malloc(sizeof(*pob->record) * pob->num_record);
pob->title[0] = pob->title[1] = NULL;
for (i=0; i < NUM_TITLE; ++i)
pob->title[i] = NULL;
for (i=0; i < NUM_HEADER; ++i)
pob->header[i] = NULL;
for (i=0; i < n; ++i) {
pob->record[i].values[0] = 0;
pob->record[i].values[1] = 0;
pob->record[i].values[2] = 0;
pob->record[i].values[3] = 0;
pob->record[i].value_flag = 0;
pob->record[i].heat = 0;
pob->record[i].indent = 0;
pob->record[i].number = 0;
}
return pob;
}
static int uncounted;
static Prof_Recursion_Mode recurse = Prof_FLATTEN_RECURSION;
// Prof_traverse callback for the flat report modes: add the statistics
// of call-tree node 'c' to the report record of its zone. z->highlevel
// must point at that record. Nodes without a statistics record are
// counted in 'uncounted'.
static void propogate_to_zone(Prof_Zone_Stack *c)
{
Prof_Zone *z = c->zone;
Profile_Tracker_Data_Record *d = (Profile_Tracker_Data_Record *) c->highlevel;
Prof_Report_Record *r;
#if 1
r = (Prof_Report_Record *) z->highlevel;
#else
// disabled: per-recursion-depth records
if (recurse == Prof_FLATTEN_RECURSION)
r = (Prof_Report_Record *) z->highlevel;
else
r = ((Prof_Report_Record **) z->highlevel)[c->recursion_depth];
#endif
if (d) {
double t;
// times are scaled by 1000 (seconds to milliseconds)
r->values[0] += 1000 * get_value(&d->self_time);
r->values[1] += 1000 * get_value(&d->hierarchical_time);
r->values[2] += get_value(&d->entry_count);
// arbitrary determination for how low a moving average
// has to go to reach 0
if (get_value(&d->entry_count) > INT_ZERO_THRESHHOLD) {
if (d->max_recursion > r->number)
r->number = d->max_recursion;
// an active child marks its parent zone with a '+' prefix
if (c->parent->zone)
((Prof_Report_Record *) c->parent->zone->highlevel)->prefix = '+';
}
#ifdef Prof_CALL_HISTORY
if (display_frame) return; // no variances when examining history
#endif
if (displayed_quantity == Prof_HIERARCHICAL_TIME) {
t = d->hierarchical_time.variances[slot];
} else {
t = d->self_time.variances[slot];
}
// scale by 1000*1000 to match the millisecond values above
t = 1000 * 1000 * t;
// combine with what is already in r->heat as (sqrt(a) + sqrt(b))^2
if (r->heat == 0)
r->heat = t;
else
r->heat = r->heat + t + 2 * sqrt(r->heat * t);
} else {
++uncounted;
}
}
// Prof_traverse callback for the call-graph report. Every zone owns three
// consecutive report records reached through zone->highlevel:
//   [0] the zone as a child of the expanded zone
//   [1] the zone as a parent of the expanded zone
//   [2] the expanded zone itself
// The statistics of node 'c' are added to whichever of those roles apply.
// Nodes without a statistics record are counted in 'uncounted'.
static void propogate_expanded(Prof_Zone_Stack *c)
{
    Profile_Tracker_Data_Record *d = (Profile_Tracker_Data_Record *) c->highlevel;

    if (d == NULL) {
        ++uncounted;
        return;
    }

    // an active node marks all three records of its parent zone with '+'
    if (c->parent->zone && get_value(&d->entry_count) > INT_ZERO_THRESHHOLD) {
        ((Prof_Report_Record *) c->parent->zone->highlevel)[0].prefix = '+';
        ((Prof_Report_Record *) c->parent->zone->highlevel)[1].prefix = '+';
        ((Prof_Report_Record *) c->parent->zone->highlevel)[2].prefix = '+';
    }

    if (c->zone == expand) {
        Prof_Report_Record *r = (Prof_Report_Record *) expand->highlevel;

        // accumulate this time to ourselves
        r[2].values[0] += 1000 * get_value(&d->self_time);
        r[2].values[1] += 1000 * get_value(&d->hierarchical_time);
        r[2].values[2] += get_value(&d->entry_count);
        if (d->max_recursion > r[2].number && get_value(&d->entry_count) > INT_ZERO_THRESHHOLD)
            r[2].number = d->max_recursion;

        // propagate it to the parents
        if (c->parent->zone) {
            // The parent's own record lives in a separate variable so that
            // 'd' still refers to node c further down (this matters when
            // the expanded zone calls itself directly), and it is checked
            // for NULL before use.
            Profile_Tracker_Data_Record *pd =
                (Profile_Tracker_Data_Record *) c->parent->highlevel;

            r = (Prof_Report_Record *) c->parent->zone->highlevel;
            r[1].values[0] += 1000 * get_value(&d->self_time);
            r[1].values[1] += 1000 * get_value(&d->hierarchical_time);
            r[1].values[2] += get_value(&d->entry_count);
            if (pd != NULL && pd->max_recursion > r[1].number &&
                get_value(&pd->entry_count) > INT_ZERO_THRESHHOLD)
                r[1].number = pd->max_recursion;
        }
    }

    if (c->parent->zone == expand) {
        Prof_Report_Record *r = (Prof_Report_Record *) c->zone->highlevel;

        r[0].values[0] += 1000 * get_value(&d->self_time);
        r[0].values[1] += 1000 * get_value(&d->hierarchical_time);
        r[0].values[2] += get_value(&d->entry_count);
        if (d->max_recursion > r[0].number && get_value(&d->entry_count) > INT_ZERO_THRESHHOLD)
            r[0].number = d->max_recursion;
    }
}
static double compute_heat(double variance, double value)
{
double factor, stdev;
double fabs_value = fabs(value);
const float VARIANCE_TOLERANCE_FACTOR = 0.5f;
variance = variance - value*value;
if (variance < 0) variance = 0;
stdev = sqrt(variance);
if (fabs_value < 0.000001) {
return 0;
} else {
factor = (stdev / fabs_value) * (1.0f / VARIANCE_TOLERANCE_FACTOR);
}
if (factor < 0) return 0;
if (factor > 1) return 1;
return factor;
}
// qsort comparator for the flat report: orders records by values[0],
// largest first.
static int pob_compare(const void *p, const void *q)
{
    const double lhs = ((Prof_Report_Record *) p)->values[0];
    const double rhs = ((Prof_Report_Record *) q)->values[0];

    if (lhs > rhs)
        return -1;
    if (rhs > lhs)
        return 1;
    return 0;
}
// qsort comparator for the call-graph report. Records are grouped by
// indent: 5 first, 3 last, anything else in between. Within a group
// they are ordered by values[1]: ascending for indent 5, descending
// otherwise.
static int pob_expand_compare(const void *p, const void *q)
{
    const Prof_Report_Record *a = (const Prof_Report_Record *) p;
    const Prof_Report_Record *b = (const Prof_Report_Record *) q;
    int ascending;

    if (a->indent != b->indent) {
        // group rank: 0 for indent 5, 2 for indent 3, 1 for anything else
        const int rank_a = (a->indent == 5) ? 0 : (a->indent == 3) ? 2 : 1;
        const int rank_b = (b->indent == 5) ? 0 : (b->indent == 3) ? 2 : 1;
        return (rank_a > rank_b) - (rank_a < rank_b);
    }

    if (a->values[1] == b->values[1])
        return 0;

    ascending = (a->indent == 5);
    if (a->values[1] < b->values[1])
        return ascending ? -1 : 1;
    return ascending ? 1 : -1;
}
static int cursor;
static int update_cursor;
// Build a report for the current display mode, smoothing slot and frame.
// The caller owns the result and must release it with Prof_free_report.
// Side effects: every registered zone's 'highlevel' pointer is redirected
// to its record(s) in the new report, and the file-level cursor is
// clamped to the number of records.
Prof_Report *Prof_create_report(void)
{
double avg_frame_time,fps;
char *displayed_quantity_name;
int i,s;
Prof_Report *pob;
// the call graph needs three records per zone, the flat modes one
if (displayed_quantity == Prof_CALL_GRAPH)
s = 3;
else
s = 1;
pob = allocate_buffer(Prof_num_zones * s);
for (i=0; i < Prof_num_zones; ++i) {
Prof_Zone *z = Prof_zones[i];
Prof_Report_Record *r = &pob->record[i*s];
z->highlevel = (void *) r;
if (displayed_quantity == Prof_CALL_GRAPH) {
r[0].name = r[1].name = r[2].name = z->name;
r[0].value_flag = 1 | 2 | 4;
r[1].value_flag = 1 | 2 | 4;
r[2].value_flag = 1 | 2 | 4;
// the indent doubles as a role tag for sorting:
// 3 = child row, 5 = parent row, 0 = the expanded zone itself
r[0].indent = 3;
r[1].indent = 5;
r[2].indent = 0;
r[0].zone = r[1].zone = r[2].zone = (void *) z;
r[0].prefix = r[1].prefix = r[2].prefix = 0;
} else {
r->value_flag = 1 | 2 | 4;
r->name = z->name;
r->zone = (void *) z;
r->indent = 0;
r->prefix = 0;
}
}
avg_frame_time = frame_time.values[slot];
if (avg_frame_time == 0) avg_frame_time = 0.01f;
fps = 1.0f / avg_frame_time;
displayed_quantity_name = "*error*";
switch (displayed_quantity) {
case Prof_SELF_TIME:
displayed_quantity_name = "sort self";
break;
case Prof_HIERARCHICAL_TIME:
displayed_quantity_name = "sort hier";
break;
case Prof_CALL_GRAPH:
displayed_quantity_name = "sort hier";
break;
}
// first title line: frame time, fps and sort mode
pob->title[0] = (char *) malloc(BUFSIZ);
sprintf(pob->title[0],
"%3.3lf ms/frame (fps: %3.2lf) %s",
avg_frame_time * 1000, fps, displayed_quantity_name);
#ifdef Prof_CALL_HISTORY
if (display_frame) {
sprintf(pob->title[0] + strlen(pob->title[0]), " - %d frame%s ago",
display_frame, display_frame == 1 ? "" : "s");
} else {
strcat(pob->title[0], " - current frame");
}
#endif
if (speedstep_warning)
pob->title[1] = _strdup("WARNING: SpeedStep-like timer inconsistencies detected. Results are unreliable!");
if (displayed_quantity == Prof_CALL_GRAPH) {
// NOTE(review): 'expand' and expand->highlevel are dereferenced without
// a NULL check -- confirm this path cannot run before the expanded
// zone has been registered.
Prof_Report_Record *r = (Prof_Report_Record *) expand->highlevel;
int j=0;
Prof_traverse(propogate_expanded);
r[2].prefix = '-';
// compact the array, dropping records whose three values are all zero
for (i=0; i < pob->num_record; ++i) {
if (pob->record[i].values[0] || pob->record[i].values[1] || pob->record[i].values[2]) {
pob->record[j] = pob->record[i];
++j;
}
}
pob->num_record = j;
qsort(pob->record, pob->num_record, sizeof(pob->record[0]), pob_expand_compare);
// the role tag 5 was only needed for sorting; display parent rows
// with the same indent as child rows
for (i=0; i < pob->num_record; ++i)
if (pob->record[i].indent == 5)
pob->record[i].indent = 3;
} else {
uncounted = 0;
Prof_traverse(propogate_to_zone);
for (i=0; i < Prof_num_zones; ++i) {
// in hierarchical mode swap the first two columns so that values[0]
// is the hierarchical time (values[0] is the sort key)
if (displayed_quantity == Prof_HIERARCHICAL_TIME) {
double t = pob->record[i].values[0];
pob->record[i].values[0] = pob->record[i].values[1];
pob->record[i].values[1] = t;
}
pob->record[i].heat = compute_heat(pob->record[i].heat, pob->record[i].values[0]);
}
qsort(pob->record, pob->num_record, sizeof(pob->record[0]), pob_compare);
}
// after a selection change, move the cursor to the first record of the
// expanded zone
if (update_cursor) {
for (i=0; i < pob->num_record; ++i) {
if (pob->record[i].zone == expand) {
cursor = i;
break;
}
}
update_cursor = 0;
}
pob->header[0] = _strdup("zone");
if (displayed_quantity == Prof_HIERARCHICAL_TIME) {
pob->header[1] = _strdup("hier");
pob->header[2] = _strdup("self");
} else {
pob->header[1] = _strdup("self");
pob->header[2] = _strdup("hier");
}
pob->header[3] = _strdup("count");
// clamp the cursor; with zero records this leaves it at -1
if (cursor < 0) cursor = 0;
if (cursor >= pob->num_record) cursor = pob->num_record-1;
pob->hilight = cursor;
return pob;
}
// Release a report created by Prof_create_report: its title strings,
// header strings, record array and the report itself.
void Prof_free_report(Prof_Report *z)
{
    int k;

    // free(NULL) is a no-op, so unused entries need no special casing
    for (k = 0; k < NUM_TITLE; ++k)
        free(z->title[k]);

    for (k = 0; k < NUM_HEADER; ++k)
        free(z->header[k]);

    free(z->record);
    free(z);
}
// Move the report cursor by 'num' lines. The value is not clamped here;
// Prof_create_report clamps it when the next report is built.
Prof_extern_C void Prof_move_cursor(int num)
{
cursor += num;
}
// Place the report cursor on line 'num'. The value is not clamped here;
// Prof_create_report clamps it when the next report is built.
Prof_extern_C void Prof_set_cursor(int num)
{
cursor = num;
}
// Expand the zone under the cursor: it becomes the expanded zone and the
// report switches to call-graph mode. Nothing changes if there is no
// highlighted record or it has no zone. The cursor is re-synchronized
// the next time a report is built.
Prof_extern_C void Prof_select(void)
{
    Prof_Report *report = Prof_create_report();
    const int row = report->hilight;
    void *picked = (row >= 0) ? report->record[row].zone : NULL;

    if (picked != NULL) {
        expand = (Prof_Zone *) picked;
        displayed_quantity = Prof_CALL_GRAPH;
    }

    Prof_free_report(report);
    update_cursor = 1;
}
// Move the expansion to a parent of the expanded zone. The rows before
// the first indent-0 record are scanned, and the last one whose zone
// differs from the current expanded zone becomes the new expanded zone.
// If there is no such row the selection is unchanged. The cursor is
// re-synchronized the next time a report is built.
Prof_extern_C void Prof_select_parent(void)
{
    void *previous = (void *) expand;
    Prof_Report *report = Prof_create_report();
    int row;

    for (row = 0; row < report->num_record && report->record[row].indent != 0; ++row) {
        if (report->record[row].zone != previous)
            expand = (Prof_Zone *) report->record[row].zone;
    }

    Prof_free_report(report);
    update_cursor = 1;
}
// Choose which frame is displayed.  `num` is clamped into the range
// [0, NUM_FRAME_SLOTS-1] before being stored in `display_frame`.
Prof_extern_C void Prof_set_frame(int num)
{
   int frame = (num < 0) ? 0 : num;

   if (frame >= NUM_FRAME_SLOTS)
      frame = NUM_FRAME_SLOTS - 1;

   display_frame = frame;
}
// Step the displayed frame relative to the current one.
// `display_frame` grows as delta goes negative, so the sign is flipped
// here to make a negative delta mean "into the past".  Range clamping is
// delegated to Prof_set_frame.
Prof_extern_C void Prof_move_frame(int delta)
{
   const int target = display_frame - delta;

   Prof_set_frame(target);
}
// Select the smoothing setting.  `x` is clamped into the range
// [0, NUM_PROFILE_TRACKER_HISTORY_SLOTS-1] and stored in `slot`.
Prof_extern_C void Prof_set_smoothing(int x)
{
   int chosen = (x > 0) ? x : 0;

   if (chosen >= NUM_PROFILE_TRACKER_HISTORY_SLOTS)
      chosen = NUM_PROFILE_TRACKER_HISTORY_SLOTS - 1;

   slot = chosen;
}
// Record the requested recursion-handling mode in `recurse`.
// currently does nothing
// NOTE(review): presumably nothing consumes `recurse` yet, which is why
// the original author says this has no effect -- confirm against the rest
// of the file.
Prof_extern_C void Prof_set_recursion(Prof_Recursion_Mode e)
{
recurse = e;
}
// Derive a numeric id for a zone from its name.
// The name is hashed (rather than using the zone pointer) so the id stays
// the same from run to run.  Each step rotates the accumulator left by
// five bits -- using addition to combine the two halves -- and adds the
// next character.
// @TODO: only compute this at zone init time?
static int id(Prof_Zone *z)
{
   unsigned int hash = 0x55555555;
   char *p;

   for (p = z->name; *p != '\0'; ++p)
      hash = (hash << 5) + (hash >> 27) + *p;

   return hash;
}
// Hand per-zone history samples to `callback` so the caller can graph them.
//
//   num_frames  number of frames to cover; capped at NUM_FRAME_SLOTS
//   callback    invoked as callback(id, x0, x1, values, data), where `id`
//               is the name-derived zone id, x0/x1 is the column span
//               passed for that call, and `values` points into that
//               zone's history
//   data        opaque pointer passed through to every callback call
//
// When the requested span reaches back past the start of the history
// array, the zone is delivered in two calls: first the part starting at
// index 0, then the part taken from the end of the array.  After all
// zones, if a frame other than 0 is selected for display, one extra call
// with id 0 and a zero-width span marks that frame.
//
// Compiles to an empty function unless Prof_ZONE_HISTORY is defined.
void Prof_graph(int num_frames, void (*callback)(int id, int x0, int x1, float *values, void *data), void *data)
{
#ifdef Prof_ZONE_HISTORY
   const int newest = history_index;
   int z;

   if (num_frames > NUM_FRAME_SLOTS)
      num_frames = NUM_FRAME_SLOTS;

   for (z = 0; z < Prof_num_zones; ++z) {
      const int zone_id = id(Prof_zones[z]);

      if (newest < num_frames) {
         // span wraps: `wrapped` frames come from the end of the array
         const int wrapped = num_frames - newest;
         callback(zone_id, wrapped, num_frames, &zone_history[z][0], data);
         callback(zone_id, 0, wrapped, &zone_history[z][NUM_FRAME_SLOTS - wrapped], data);
      } else {
         callback(zone_id, 0, num_frames, &zone_history[z][newest - num_frames], data);
      }
   }

   // display frame "cursor"
   if (display_frame != 0) {
      float marker[2] = { 2.0, 0 };
      const int column = NUM_FRAME_SLOTS - 1 - display_frame;
      callback(0, column, column, marker, data);
   }
#endif
}

View file

@ -1,21 +0,0 @@
#define WIN32_LEAN_AND_MEAN
#define WIN32_EXTRA_LEAN
#include <windows.h>
#include <assert.h>
double Prof_get_time(void)
{
LARGE_INTEGER freq;
LARGE_INTEGER time;
BOOL ok = QueryPerformanceFrequency(&freq);
assert(ok == TRUE);
freq.QuadPart = freq.QuadPart;
ok = QueryPerformanceCounter(&time);
assert(ok == TRUE);
return time.QuadPart / (double) freq.QuadPart;
}

View file

@ -1,24 +0,0 @@
// Win32 timestamp source for the profiler: defines the 64-bit timestamp
// type and the function that reads the CPU time-stamp counter.
#ifndef Prof_INC_PROF_WIN32_H
#define Prof_INC_PROF_WIN32_H
// 64-bit integer that receives the raw timestamp (compiler-specific __int64).
typedef __int64 Prof_Int64;
// Store the processor's time-stamp counter into *result.
//
// The storage class is picked by the preprocessor: `inline` when compiled
// as C++, `__forceinline` when _MSC_VER >= 1200, plain `static` otherwise.
//
// `rdtsc` leaves the counter in EDX:EAX; the low half (EAX) is written to
// the first four bytes of *result and the high half (EDX) to the next four.
// NOTE(review): the __asm block and the 32-bit pointer held in EBX look
// specific to 32-bit x86 with an MSVC-style inline assembler -- confirm
// before building for any other target.
#ifdef __cplusplus
inline
#elif _MSC_VER >= 1200
__forceinline
#else
static
#endif
void Prof_get_timestamp(Prof_Int64 *result)
{
__asm {
rdtsc;
mov ebx, result
mov [ebx], eax
mov [ebx+4], edx
}
}
#endif