
Counting Characters in UTF-8 Strings Is Fast(er)


by George Pollard


‘Counting Characters in UTF-8 Strings Is Fast’ by Kragen Sitaker shows several ways to count the characters in a UTF-8 string, using both assembly and C. But, with a few assumptions, we can go faster.


Assumption One: We are dealing with a valid UTF-8 string


Making this assumption means that once we hit the start of a multi-byte character we can skip forward a few places. It also means we don’t check for hitting invalid characters (this can send the algorithm into an infinite loop if run on non-valid input, and it is possible to make the algorithm run past the end of the buffer by supplying malformed data).


Assumption Two: Most strings are ASCII


Therefore, run a simple ASCII count routine beforehand. As soon as we hit a non-ASCII character switch into counting UTF-8.


The code


Note: The current code relies on char being a signed 8-bit type.

int porges_strlen2(char *s)
{
        int i = 0;

        //Go fast if string is only ASCII.
        //Loop while not at end of string,
        // and not reading anything with highest bit set.
        //If highest bit is set, number is negative.
        while (s[i] > 0)
                i++;

        if (s[i] <= -65) // all follower bytes have values below -65
                return -1; // invalid

        //Note, however, that the following code does *not*
        // check for invalid characters.
        //The above is just included to bail out on the tests :)

        int count = i;
        while (s[i])
        {
                //if ASCII just go to next character
                if (s[i] > 0)      i += 1;
                else
                //select amongst multi-byte starters
                switch (0xF0 & s[i])
                {
                        case 0xE0: i += 3; break;
                        case 0xF0: i += 4; break;
                        default:   i += 2; break;
                }
                ++count;
        }
        return count;
}

Results


I used Kragen’s testing code, but removed all the strlens that didn’t do UTF-8 counting, and added one test for valid UTF-8 text (just the phrase ‘こんにちは’ repeated). It is twice as fast on both the ASCII-only and UTF-8 tests. The improvement on ASCII is due to the ASCII-only routine, and the improvement on UTF-8 is due to skipping bytes.

"": 0 0 0 0 0
"hello, world": 12 12 12 12 12
"naïve": 5 5 5 5 5
"こんにちは": 5 5 5 5 5
1: all 'a':
1:           porges_strlen2(string) =   33554431: 0.034672
1:         ap_strlen_utf8_s(string) =   33554431: 0.068210
1:         my_strlen_utf8_c(string) =   33554431: 0.071038
1:         my_strlen_utf8_s(string) =   33554431: 0.135856
2: all '\xe3':
2:           porges_strlen2(string) =   11184811: 0.032115
2:         ap_strlen_utf8_s(string) =   33554431: 0.068228
2:         my_strlen_utf8_c(string) =   33554431: 0.071050
2:         my_strlen_utf8_s(string) =   33554431: 0.152513
3: all '\x81':
3:           porges_strlen2(string) =         -1: 0.000001
3:         my_strlen_utf8_s(string) =          0: 0.068339
3:         ap_strlen_utf8_s(string) =          0: 0.068547
3:         my_strlen_utf8_c(string) =          0: 0.071039
4: all konichiwa:
4:           porges_strlen2(string) =   11184810: 0.032143
4:         ap_strlen_utf8_s(string) =   11184810: 0.068271
4:         my_strlen_utf8_c(string) =   11184810: 0.071036
4:         my_strlen_utf8_s(string) =   11184810: 0.089478

Note also that the invalid UTF-8 gives strange results; this is because the algorithm isn’t meant to work on it! (The first invalid sequence is a list of 3-byte starters, so the result is divided by 3 due to skipping, and the second is a list of follower bytes, so the code bails out.)


Going faster


By dropping back to the ASCII counter whenever we hit ASCII again, we go even faster. This will handle the cases (such as in English) where there are many ASCII characters and only a few multibyte ones.

int porges_strlen2(char *s)
{
        int i = 0;
        int iBefore = 0;
        int count = 0;

        //ASCII fast path; the main loop below jumps back in here
        while (s[i] > 0)
                ascii:  i++;

        count += i-iBefore;
        while (s[i])
        {
                //hit ASCII again: record the position and
                // drop back into the fast counter
                if (s[i] > 0)
                {
                        iBefore = i;
                        goto ascii;
                }
                else
                switch (0xF0 & s[i])
                {
                        case 0xE0: i += 3; break;
                        case 0xF0: i += 4; break;
                        default:   i += 2; break;
                }
                ++count;
        }
        return count;
}

But on the ‘konichiwa’ test the speed improvement happens even though we’re counting pure multi-byte text, and I’m not sure exactly why… probably something to do with branch prediction or another arcane CPU topic I don’t understand.

4: all konichiwa:
4:           porges_strlen2(string) =   11184810: 0.026017
4:         ap_strlen_utf8_s(string) =   11184810: 0.068320
4:         my_strlen_utf8_c(string) =   11184810: 0.071035
4:         my_strlen_utf8_s(string) =   11184810: 0.089464
5: mixed english:
5:           porges_strlen2(string) =   32435949: 0.040342
5:         my_strlen_utf8_c(string) =   32435949: 0.071035
5:         ap_strlen_utf8_s(string) =   32435949: 0.078233
5:         my_strlen_utf8_s(string) =   32435949: 0.160676

Without the drop-back-to-ASCII modification:

5: mixed english:
5:           porges_strlen2(string) =   32435949: 0.067753