Some documentation about fast character counting.

This commit is contained in:
King_DuckZ 2014-06-20 19:04:39 +02:00
parent 90968e75cd
commit 9c669480a6
2 changed files with 602 additions and 0 deletions

View file

@ -0,0 +1,333 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>Counting Characters in UTF-8 Strings Is Fast(er) : porges</title>
<!--[if gte IE 7]><!-->
<link rel="stylesheet" href="http://porg.es/blog/wp-content/themes/manifest_v1.1/style.css" type="text/css" media="screen" charset="utf-8"/>
<!-- <![endif]-->
<!--[if IE 7]>
<link rel="stylesheet" href="http://porg.es/blog/wp-content/themes/manifest_v1.1/style_ie.css" type="text/css" media="screen" charset="utf-8" />
<![endif]-->
<!--[if IE 6]>
<link rel="stylesheet" type="text/css" media="screen" href="http://porg.es/blog/wp-content/themes/manifest_v1.1/styles_ie6.css" />
<![endif]-->
<link rel="alternate" type="application/rss+xml" title="porges RSS Feed" href="http://porg.es/blog/feed"/>
<link rel="alternate" type="application/atom+xml" title="porges Atom Feed" href="http://porg.es/blog/feed/atom"/>
<script type="text/javascript" src="http://porg.es/blog/wp-content/themes/manifest_v1.1/js/jquery-1.3.2.min.js.pagespeed.jm.eWR5IUrZrf.js" charset="utf-8"></script>
<link rel="pingback" href="http://porg.es/blog/xmlrpc.php"/>
<link rel="alternate" type="application/rss+xml" title="porges &raquo; Counting Characters in UTF-8 Strings Is Fast(er) Comments Feed" href="http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster/feed"/>
<link rel='stylesheet' id='wp-syntax-css-css' href='http://porg.es/blog/wp-content/plugins/wp-syntax/css/A.wp-syntax.css,qver=1.0.pagespeed.cf.S1RLOpa5BE.css' type='text/css' media='all'/>
<script type='text/javascript'>//<![CDATA[
/**
 * Comment-reply helper.
 *
 * Moves the reply form so that it sits directly after a chosen element and
 * arms the "cancel reply" link so the form can be put back where it was.
 */
var addComment = {
    /**
     * Relocate the reply form.
     *
     * @param {string} anchorId    id of the element the form is inserted after
     * @param {string} parentValue value stored in the "comment_parent" field
     * @param {string} formId      id of the form container that gets moved
     * @param {string} [postValue] value stored in the "comment_post_ID" field
     *                             (only when truthy and the field exists)
     * @returns {(boolean|undefined)} false after the form has been moved;
     *     undefined when the anchor, the form, the cancel link or the
     *     "comment_parent" field cannot be found
     */
    moveForm: function (anchorId, parentValue, formId, postValue) {
        var self = this;
        var anchor = self.I(anchorId);
        var form = self.I(formId);
        var cancelLink = self.I("cancel-comment-reply-link");
        var parentField = self.I("comment_parent");
        var postField = self.I("comment_post_ID");
        var placeholder;

        if (!(anchor && form && cancelLink && parentField)) {
            return;
        }

        // Remembered so the cancel handler knows which element to move back.
        self.respondId = formId;
        postValue = postValue || false;

        // Leave a hidden marker at the form's current position (only once).
        if (!self.I("wp-temp-form-div")) {
            placeholder = document.createElement("div");
            placeholder.id = "wp-temp-form-div";
            placeholder.style.display = "none";
            form.parentNode.insertBefore(placeholder, form);
        }

        anchor.parentNode.insertBefore(form, anchor.nextSibling);

        if (postField && postValue) {
            postField.value = postValue;
        }
        parentField.value = parentValue;
        cancelLink.style.display = "";

        cancelLink.onclick = function () {
            var helper = addComment;
            var marker = helper.I("wp-temp-form-div");
            var movedForm = helper.I(helper.respondId);

            if (!marker || !movedForm) {
                return;
            }

            // Reset the parent, swap the form back in for the marker, then
            // hide and disarm the cancel link itself.
            helper.I("comment_parent").value = "0";
            marker.parentNode.insertBefore(movedForm, marker);
            marker.parentNode.removeChild(marker);
            this.style.display = "none";
            this.onclick = null;
            return false;
        };

        try {
            self.I("comment").focus();
        } catch (ignored) {
            // Any exception raised while focusing is deliberately discarded.
        }

        return false;
    },

    /**
     * Shorthand for document.getElementById.
     *
     * @param {string} id element id to look up
     * @returns {*} whatever document.getElementById returns for that id
     */
    I: function (id) {
        return document.getElementById(id);
    }
};
//]]></script>
<link rel="EditURI" type="application/rsd+xml" title="RSD" href="http://porg.es/blog/xmlrpc.php?rsd"/>
<link rel="wlwmanifest" type="application/wlwmanifest+xml" href="http://porg.es/blog/wp-includes/wlwmanifest.xml"/>
<link rel='prev' title='477211307' href='http://porg.es/blog/477211307'/>
<link rel='next' title='Ridiculous UTF-8 character counting' href='http://porg.es/blog/ridiculous-utf-8-character-counting'/>
<meta name="generator" content="WordPress 3.8.3"/>
<link rel='canonical' href='http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster'/>
<link rel='shortlink' href='http://porg.es/blog/?p=130'/>
<!-- analytics tracking -->
<script type="text/javascript">/* Google Analytics loader: queues the account id (_setAccount) and a page view (_trackPageview) on the global _gaq array, then creates an async script element pointing at ga.js (https://ssl host on https pages, http://www otherwise) and inserts it before the first script element of the page. */var _gaq=_gaq||[];_gaq.push(['_setAccount','UA-1899042-1']);_gaq.push(['_trackPageview']);(function(){var ga=document.createElement('script');ga.type='text/javascript';ga.async=true;ga.src=('https:'==document.location.protocol?'https://ssl':'http://www')+'.google-analytics.com/ga.js';var s=document.getElementsByTagName('script')[0];s.parentNode.insertBefore(ga,s);})();</script>
</head>
<body class="single single-post postid-130 single-format-standard">
<div id="siteWrapper">
<h1 class="vcard author"><a href="http://porg.es/blog/" title="Home" class="fn">porges</a></h1>
<div id="mainNav">
<ul>
<li class="page_item page-item-105"><a href="http://porg.es/blog/about-me">About Me</a></li>
</ul>
</div>
<div id="siteDescription">
</div>
<div id="coreContent">
<div class="post single hentry">
<div class="postContent">
<h3 class="entry-title">Counting Characters in UTF-8 Strings Is Fast(er)</h3>
<h4 class="vcard author">by <span class="fn">George Pollard</span></h4>
<div class="entry-content">
<p><a href="http://canonical.org/~kragen/strlen-utf8.html">Counting Characters in UTF-8 Strings Is Fast</a> by Kragen Sitaker shows several ways to count characters in UTF-8 strings, using both assembly and C. But, with a few assumptions, we can go faster.</p>
<h3>Assumption One: We are dealing with a valid UTF-8 string</h3>
<p>Making this assumption means that once we hit the start of a multi-byte character we can skip forward a few places. It also means we don&#8217;t check for hitting invalid characters (<s>this sends the algorithm into an infinite loop if run on non-valid input</s> it is possible to make the algorithm run past the end of the buffer by supplying malformed data).</p>
<h3>Assumption Two: Most strings are ASCII</h3>
<p>Therefore, run a simple ASCII count routine beforehand. As soon as we hit a non-ASCII character switch into counting UTF-8.</p>
<h3>The code</h3>
<p>Note: The current code relies on chars being signed bytes.</p>
<div class="wp_syntax"><table><tr><td class="code"><pre class="c" style="font-family:monospace;"><span style="color: #993333;">int</span> porges_strlen2<span style="color: #009900;">&#40;</span><span style="color: #993333;">char</span> <span style="color: #339933;">*</span>s<span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #993333;">int</span> i <span style="color: #339933;">=</span> <span style="color: #0000dd;">0</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #666666; font-style: italic;">//Go fast if string is only ASCII.</span>
<span style="color: #666666; font-style: italic;">//Loop while not at end of string,</span>
<span style="color: #666666; font-style: italic;">// and not reading anything with highest bit set.</span>
<span style="color: #666666; font-style: italic;">//If highest bit is set, number is negative.</span>
<span style="color: #b1b100;">while</span> <span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span> <span style="color: #339933;">&gt;</span> <span style="color: #0000dd;">0</span><span style="color: #009900;">&#41;</span>
i<span style="color: #339933;">++;</span>
&nbsp;
<span style="color: #b1b100;">if</span> <span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span> <span style="color: #339933;">&lt;=</span> <span style="color: #339933;">-</span><span style="color: #0000dd;">65</span><span style="color: #009900;">&#41;</span> <span style="color: #666666; font-style: italic;">// all follower bytes have values below -65</span>
<span style="color: #b1b100;">return</span> <span style="color: #339933;">-</span><span style="color: #0000dd;">1</span><span style="color: #339933;">;</span> <span style="color: #666666; font-style: italic;">// invalid</span>
&nbsp;
<span style="color: #666666; font-style: italic;">//Note, however, that the following code does *not*</span>
<span style="color: #666666; font-style: italic;">// check for invalid characters.</span>
<span style="color: #666666; font-style: italic;">//The above is just included to bail out on the tests :)</span>
&nbsp;
<span style="color: #993333;">int</span> count <span style="color: #339933;">=</span> i<span style="color: #339933;">;</span>
<span style="color: #b1b100;">while</span> <span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #666666; font-style: italic;">//if ASCII just go to next character</span>
<span style="color: #b1b100;">if</span> <span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span> <span style="color: #339933;">&gt;</span> <span style="color: #0000dd;">0</span><span style="color: #009900;">&#41;</span> i <span style="color: #339933;">+=</span> <span style="color: #0000dd;">1</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">else</span>
<span style="color: #666666; font-style: italic;">//select amongst multi-byte starters</span>
<span style="color: #b1b100;">switch</span> <span style="color: #009900;">&#40;</span><span style="color: #208080;">0xF0</span> <span style="color: #339933;">&amp;</span> s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #b1b100;">case</span> <span style="color: #208080;">0xE0</span><span style="color: #339933;">:</span> i <span style="color: #339933;">+=</span> <span style="color: #0000dd;">3</span><span style="color: #339933;">;</span> <span style="color: #000000; font-weight: bold;">break</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">case</span> <span style="color: #208080;">0xF0</span><span style="color: #339933;">:</span> i <span style="color: #339933;">+=</span> <span style="color: #0000dd;">4</span><span style="color: #339933;">;</span> <span style="color: #000000; font-weight: bold;">break</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">default</span><span style="color: #339933;">:</span> i <span style="color: #339933;">+=</span> <span style="color: #0000dd;">2</span><span style="color: #339933;">;</span> <span style="color: #000000; font-weight: bold;">break</span><span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #339933;">++</span>count<span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #b1b100;">return</span> count<span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span></pre></td></tr></table></div>
<h3>Results</h3>
<p>I used Kragen&#8217;s testing code, but removed all <code>strlen</code>s that didn&#8217;t do UTF-8 counting, and added one test for valid UTF-8 text (just the phrase ‘こんにちは’ repeated). The result is twice as fast on both the ASCII-only and UTF-8 tests. The improvement on ASCII is due to the ASCII-only routine, and the improvement on UTF-8 is due to skipping bytes.</p>
<pre><code>"": 0 0 0 0 0
"hello, world": 12 12 12 12 12
"naïve": 5 5 5 5 5
"こんにちは": 5 5 5 5 5
1: all 'a':
1: porges_strlen2(string) = 33554431: 0.034672
1: ap_strlen_utf8_s(string) = 33554431: 0.068210
1: my_strlen_utf8_c(string) = 33554431: 0.071038
1: my_strlen_utf8_s(string) = 33554431: 0.135856
2: all '\xe3':
2: porges_strlen2(string) = 11184811: 0.032115
2: ap_strlen_utf8_s(string) = 33554431: 0.068228
2: my_strlen_utf8_c(string) = 33554431: 0.071050
2: my_strlen_utf8_s(string) = 33554431: 0.152513
3: all '\x81':
3: porges_strlen2(string) = -1: 0.000001
3: my_strlen_utf8_s(string) = 0: 0.068339
3: ap_strlen_utf8_s(string) = 0: 0.068547
3: my_strlen_utf8_c(string) = 0: 0.071039
4: all konichiwa:
4: porges_strlen2(string) = 11184810: 0.032143
4: ap_strlen_utf8_s(string) = 11184810: 0.068271
4: my_strlen_utf8_c(string) = 11184810: 0.071036
4: my_strlen_utf8_s(string) = 11184810: 0.089478
</code></pre>
<p>Note also that the invalid UTF-8 gives strange results; this is because the algorithm isn&#8217;t meant to work on it! (The first invalid sequence is a list of 3-byte starters, so the result is divided by 3 due to skipping, and the second is a list of follower bytes, so the code bails out.)</p>
<h3>Going faster</h3>
<p>By dropping back to the ASCII counter whenever we hit ASCII again, we go even faster. This will handle the cases (such as in English) where there are many ASCII characters and only a few multibyte ones.</p>
<div class="wp_syntax"><table><tr><td class="code"><pre class="c" style="font-family:monospace;"><span style="color: #993333;">int</span> porges_strlen2<span style="color: #009900;">&#40;</span><span style="color: #993333;">char</span> <span style="color: #339933;">*</span>s<span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #993333;">int</span> i <span style="color: #339933;">=</span> <span style="color: #0000dd;">0</span><span style="color: #339933;">;</span>
<span style="color: #993333;">int</span> iBefore <span style="color: #339933;">=</span> <span style="color: #0000dd;">0</span><span style="color: #339933;">;</span>
<span style="color: #993333;">int</span> count <span style="color: #339933;">=</span> <span style="color: #0000dd;">0</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #b1b100;">while</span> <span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span> <span style="color: #339933;">&gt;</span> <span style="color: #0000dd;">0</span><span style="color: #009900;">&#41;</span>
ascii<span style="color: #339933;">:</span> i<span style="color: #339933;">++;</span>
&nbsp;
count <span style="color: #339933;">+=</span> i<span style="color: #339933;">-</span>iBefore<span style="color: #339933;">;</span>
<span style="color: #b1b100;">while</span> <span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #b1b100;">if</span> <span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span> <span style="color: #339933;">&gt;</span> <span style="color: #0000dd;">0</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
iBefore <span style="color: #339933;">=</span> i<span style="color: #339933;">;</span>
<span style="color: #b1b100;">goto</span> ascii<span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #b1b100;">else</span>
<span style="color: #b1b100;">switch</span> <span style="color: #009900;">&#40;</span><span style="color: #208080;">0xF0</span> <span style="color: #339933;">&amp;</span> s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #b1b100;">case</span> <span style="color: #208080;">0xE0</span><span style="color: #339933;">:</span> i <span style="color: #339933;">+=</span> <span style="color: #0000dd;">3</span><span style="color: #339933;">;</span> <span style="color: #000000; font-weight: bold;">break</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">case</span> <span style="color: #208080;">0xF0</span><span style="color: #339933;">:</span> i <span style="color: #339933;">+=</span> <span style="color: #0000dd;">4</span><span style="color: #339933;">;</span> <span style="color: #000000; font-weight: bold;">break</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">default</span><span style="color: #339933;">:</span> i <span style="color: #339933;">+=</span> <span style="color: #0000dd;">2</span><span style="color: #339933;">;</span> <span style="color: #000000; font-weight: bold;">break</span><span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #339933;">++</span>count<span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #b1b100;">return</span> count<span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span></pre></td></tr></table></div>
<p>But on the konichiwa test the speed improvement happens even though we&#8217;re counting pure multibyte, and I&#8217;m not sure exactly why&#8230; probably something to do with branch prediction or another arcane CPU topic I don&#8217;t understand. <img src="http://porg.es/blog/wp-content/plugins/wp-smiley-switcher/noktahhitam/xicon_smile.gif.pagespeed.ic.QDVWtG6ocf.png" alt="" pagespeed_url_hash="666880254"/></p>
<pre><code>4: all konichiwa:
4: porges_strlen2(string) = 11184810: 0.026017
4: ap_strlen_utf8_s(string) = 11184810: 0.068320
4: my_strlen_utf8_c(string) = 11184810: 0.071035
4: my_strlen_utf8_s(string) = 11184810: 0.089464
5: mixed english:
5: porges_strlen2(string) = 32435949: 0.040342
5: my_strlen_utf8_c(string) = 32435949: 0.071035
5: ap_strlen_utf8_s(string) = 32435949: 0.078233
5: my_strlen_utf8_s(string) = 32435949: 0.160676</code></pre>
<p>Without the drop-back-to-ASCII modification:</p>
<pre><code>5: mixed english:
5: porges_strlen2(string) = 32435949: 0.067753</code></pre>
</div>
</div>
<div class="postMeta">
<div class="postDate"><span>Published:</span> <abbr class="published" title="2008-06-04T17:34:57+0000"><a href="http://porg.es/blog/2008/06/04">June 4, 2008</a></abbr></div>
<div class="categories"><span>Filed Under:</span> <a href="http://porg.es/blog/category/code" title="View all posts in code" rel="category tag">code</a></div>
<span>Tags:</span> <a href="http://porg.es/blog/tag/c" rel="tag">C</a> : <a href="http://porg.es/blog/tag/code" rel="tag">code</a> : <a href="http://porg.es/blog/tag/fast" rel="tag">fast</a> : <a href="http://porg.es/blog/tag/speed" rel="tag">speed</a> : <a href="http://porg.es/blog/tag/strings" rel="tag">strings</a> : <a href="http://porg.es/blog/tag/strlen" rel="tag">strlen</a> : <a href="http://porg.es/blog/tag/utf8" rel="tag">utf8</a>
</div>
</div>
<!-- You can start editing here. -->
<div id="comments">
<h3 id="comments">6 Responses to &#8220;Counting Characters in UTF-8 Strings Is Fast(er)&#8221;</h3>
<div class="navigation">
<div class="alignleft"></div>
<div class="alignright"></div>
</div>
<ol class="commentlist">
<li class="comment even thread-even depth-1" id="comment-71725">
<div id="div-comment-71725" class="comment-body">
<div class="comment-author vcard">
<img alt='' src='http://0.gravatar.com/avatar/67d17db3077b5f34fa8798445546d86a?s=48&amp;d=http%3A%2F%2F0.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D48&amp;r=X' class='avatar avatar-48 photo' height='48' width='48' pagespeed_url_hash="1353091553"/> <cite class="fn">matthew</cite> <span class="says">says:</span> </div>
<div class="comment-meta commentmetadata"><a href="http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster/comment-page-1#comment-71725">
June 4, 2008 at 7:15 pm</a> </div>
<p>BTW, his name is Kragen, not Ragen.</p>
<div class="reply">
</div>
</div>
</li><!-- #comment-## -->
<li class="comment byuser comment-author-administrator bypostauthor odd alt thread-odd thread-alt depth-1" id="comment-71727">
<div id="div-comment-71727" class="comment-body">
<div class="comment-author vcard">
<img alt='' src='http://0.gravatar.com/avatar/4ee697b9dc2411d50fb66a4f330ba4df?s=48&amp;d=http%3A%2F%2F0.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D48&amp;r=X' class='avatar avatar-48 photo' height='48' width='48' pagespeed_url_hash="152127690"/> <cite class="fn"><a href='http://porg.es/blog/' rel='external' class='url'>Porges</a></cite> <span class="says">says:</span> </div>
<div class="comment-meta commentmetadata"><a href="http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster/comment-page-1#comment-71727">
June 4, 2008 at 7:55 pm</a> </div>
<p>Whoops <img src="http://porg.es/blog/wp-content/plugins/wp-smiley-switcher/noktahhitam/xicon_smile.gif.pagespeed.ic.QDVWtG6ocf.png" alt="" pagespeed_url_hash="666880254"/></p>
<p>I think the URL must have tripped me up; I&#8217;m so used to Bob Smith being /~bsmith/&#8230;</p>
<div class="reply">
</div>
</div>
</li><!-- #comment-## -->
<li class="comment even thread-even depth-1" id="comment-71757">
<div id="div-comment-71757" class="comment-body">
<div class="comment-author vcard">
<img alt='' src='http://0.gravatar.com/avatar/23fa05e98b38b413b8cab2897f084621?s=48&amp;d=http%3A%2F%2F0.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D48&amp;r=X' class='avatar avatar-48 photo' height='48' width='48' pagespeed_url_hash="2557715616"/> <cite class="fn">Savvu</cite> <span class="says">says:</span> </div>
<div class="comment-meta commentmetadata"><a href="http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster/comment-page-1#comment-71757">
June 5, 2008 at 2:00 am</a> </div>
<p>while(*s) cnt += tbl[*s++ &gt;&gt; 4]; return cnt;</p>
<p>Setting up tbl is left as an exercise to the reader. If your chars are signed you also need an AND mask.</p>
<div class="reply">
</div>
</div>
</li><!-- #comment-## -->
<li class="comment byuser comment-author-administrator bypostauthor odd alt thread-odd thread-alt depth-1" id="comment-71817">
<div id="div-comment-71817" class="comment-body">
<div class="comment-author vcard">
<img alt='' src='http://0.gravatar.com/avatar/4ee697b9dc2411d50fb66a4f330ba4df?s=48&amp;d=http%3A%2F%2F0.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D48&amp;r=X' class='avatar avatar-48 photo' height='48' width='48' pagespeed_url_hash="152127690"/> <cite class="fn"><a href='http://porg.es/blog/' rel='external' class='url'>Porges</a></cite> <span class="says">says:</span> </div>
<div class="comment-meta commentmetadata"><a href="http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster/comment-page-1#comment-71817">
June 5, 2008 at 12:47 pm</a> </div>
<p>Hi Savvu, I implemented this as:</p>
<div class="wp_syntax"><table><tr><td class="code"><pre class="c" style="font-family:monospace;"><span style="color: #993333;">int</span> tbl<span style="color: #009900;">&#91;</span><span style="color: #009900;">&#93;</span> <span style="color: #339933;">=</span> <span style="color: #009900;">&#123;</span>
<span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">0</span><span style="color: #339933;">,</span><span style="color: #0000dd;">0</span><span style="color: #339933;">,</span><span style="color: #0000dd;">0</span><span style="color: #339933;">,</span><span style="color: #0000dd;">0</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span>
<span style="color: #009900;">&#125;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #993333;">int</span> savvu_strlen<span style="color: #009900;">&#40;</span><span style="color: #993333;">char</span> <span style="color: #339933;">*</span>s<span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #993333;">int</span> cnt <span style="color: #339933;">=</span> <span style="color: #0000dd;">0</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">while</span><span style="color: #009900;">&#40;</span><span style="color: #339933;">*</span>s<span style="color: #009900;">&#41;</span> cnt <span style="color: #339933;">+=</span> tbl<span style="color: #009900;">&#91;</span><span style="color: #009900;">&#40;</span><span style="color: #339933;">*</span>s<span style="color: #339933;">++</span> <span style="color: #339933;">&gt;&gt;</span> <span style="color: #0000dd;">4</span><span style="color: #009900;">&#41;</span> <span style="color: #339933;">&amp;</span> <span style="color: #208080;">0xF</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">return</span> cnt<span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span></pre></td></tr></table></div>
<p>It is consistently the slowest or second-slowest.</p>
<p>I tried implementing it with byte-skipping:</p>
<div class="wp_syntax"><table><tr><td class="code"><pre class="c" style="font-family:monospace;"><span style="color: #993333;">int</span> tbl<span style="color: #009900;">&#91;</span><span style="color: #009900;">&#93;</span> <span style="color: #339933;">=</span> <span style="color: #009900;">&#123;</span>
<span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span> <span style="color: #666666; font-style: italic;">//one-byte</span>
<span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span><span style="color: #0000dd;">1</span><span style="color: #339933;">,</span> <span style="color: #666666; font-style: italic;">//invalid, but don't go into infinite loop</span>
<span style="color: #0000dd;">2</span><span style="color: #339933;">,</span><span style="color: #0000dd;">2</span><span style="color: #339933;">,</span> <span style="color: #666666; font-style: italic;">//two-byte starter</span>
<span style="color: #0000dd;">3</span><span style="color: #339933;">,</span> <span style="color: #666666; font-style: italic;">//three-byte starter</span>
<span style="color: #0000dd;">4</span> <span style="color: #666666; font-style: italic;">//four-byte starter</span>
<span style="color: #009900;">&#125;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #993333;">int</span> porges_strlen<span style="color: #009900;">&#40;</span><span style="color: #993333;">char</span> <span style="color: #339933;">*</span>s<span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #993333;">int</span> cnt <span style="color: #339933;">=</span> <span style="color: #0000dd;">0</span><span style="color: #339933;">;</span>
<span style="color: #993333;">int</span> i <span style="color: #339933;">=</span> <span style="color: #0000dd;">0</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">while</span><span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span> <span style="color: #009900;">&#123;</span> i <span style="color: #339933;">+=</span> tbl<span style="color: #009900;">&#91;</span><span style="color: #009900;">&#40;</span>s<span style="color: #009900;">&#91;</span>i<span style="color: #009900;">&#93;</span> <span style="color: #339933;">&gt;&gt;</span> <span style="color: #0000dd;">4</span><span style="color: #009900;">&#41;</span> <span style="color: #339933;">&amp;</span> <span style="color: #208080;">0x0f</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">;</span> <span style="color: #339933;">++</span>cnt<span style="color: #339933;">;</span> <span style="color: #009900;">&#125;</span>
<span style="color: #b1b100;">return</span> cnt<span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span></pre></td></tr></table></div>
<p>This version is only faster on the byte-skipping tests, and is still about half the speed of what I posted.</p>
<div class="reply">
</div>
</div>
</li><!-- #comment-## -->
<li class="comment even thread-even depth-1" id="comment-71853">
<div id="div-comment-71853" class="comment-body">
<div class="comment-author vcard">
<img alt='' src='http://1.gravatar.com/avatar/b86a032cdc22e3758a45d4e411eb9782?s=48&amp;d=http%3A%2F%2F1.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D48&amp;r=X' class='avatar avatar-48 photo' height='48' width='48' pagespeed_url_hash="2598894441"/> <cite class="fn"><a href='http://www.daemonology.net/blog/' rel='external' class='url'>Colin Percival</a></cite> <span class="says">says:</span> </div>
<div class="comment-meta commentmetadata"><a href="http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster/comment-page-1#comment-71853">
June 5, 2008 at 9:24 pm</a> </div>
<p>I&#8217;ve done even better. <img src="http://porg.es/blog/wp-content/plugins/wp-smiley-switcher/noktahhitam/xicon_smile.gif.pagespeed.ic.QDVWtG6ocf.png" alt="" pagespeed_url_hash="666880254"/></p>
<p>Vectorization yields a 2-4x speedup over your code: <a href="http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html">http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html</a></p>
<div class="reply">
</div>
</div>
</li><!-- #comment-## -->
<li class="pingback odd alt thread-odd thread-alt depth-1" id="comment-85580">
<div id="div-comment-85580" class="comment-body">
<div class="comment-author vcard">
<cite class="fn"><a href='http://initiative.yo2.cn/archives/634387' rel='external' class='url'>几个汇编/C高性能处理UTF-8的帖子</a></cite> <span class="says">says:</span> </div>
<div class="comment-meta commentmetadata"><a href="http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster/comment-page-1#comment-85580">
November 17, 2008 at 11:40 pm</a> </div>
<p>[...] COUNTING CHARACTERS IN UTF-8 STRINGS IS FAST(ER) <a href="http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster">http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster</a> [...]</p>
<div class="reply">
</div>
</div>
</li><!-- #comment-## -->
</ol>
<div class="navigation">
<div class="alignleft"></div>
<div class="alignright"></div>
</div>
<div id="respond">
<form action="http://porg.es/blog/wp-comments-post.php" method="post" id="commentform">
<div class="leaveComment">
<fieldset>
<legend><span>Leave a Comment</span></legend>
<div class="commentForm">
<div class="commentAuthorInfo">
<label>Name: <em>Required</em> <input type="text" name="author" id="author" value=""/></label>
<label>Email: <em>Required, not published</em> <input type="text" name="email" id="email" value=""/></label>
<label>Homepage: <input type="text" name="url" id="url" value=""/></label>
</div>
<label>Comment:
<textarea name="comment" id="comment" cols="50" rows="20"></textarea></label>
<div id="cancel-comment-reply">
<small><a rel="nofollow" id="cancel-comment-reply-link" href="/blog/counting-characters-in-utf-8-strings-is-faster#respond" style="display:none;">Cancel Reply</a></small>
</div>
<input type="submit" value="Post Comment"/> <input type="hidden" name="comment_post_ID" value="130"/>
</div>
</fieldset>
</div>
<input type='hidden' name='comment_post_ID' value='130' id='comment_post_ID'/>
<input type='hidden' name='comment_parent' id='comment_parent' value='0'/>
<p style="display: none;"><input type="hidden" id="akismet_comment_nonce" name="akismet_comment_nonce" value="65ee253bdd"/></p>
</form>
</div>
</div>
<div class="pageNav">
<div class="prev"><a href="http://porg.es/blog/477211307" rel="prev">&laquo; Previous Post</a></div>
<div class="next"><a href="http://porg.es/blog/ridiculous-utf-8-character-counting" rel="next">Next Post &raquo;</a></div>
</div>
</div>
</div>
<div id="footer">
<!-- Footer Links -->
<h5>Elsewhere</h5>
<ul class="elsewhere">
</ul>
<!-- Search Field -->
<div class="footerContent">
<form method="get" id="searchform" action="http://porg.es/blog/">
<div id="search">
<input type="text" value="" name="s" id="s"/>
<input type="submit" id="searchsubmit" value="Search"/>
</div>
</form>
<p>&copy; porges. Powered by <a href="http://wordpress.org/">WordPress</a> and <a href="http://jimbarraud.com/manifest/">Manifest</a></p>
</div>
</div>
<script type="text/javascript">//<![CDATA[
/*
 * Critical-images beacon, installed on window.pagespeed (minified).
 *
 * I(a, b, c) is the collector: it stores the beacon URL (a), the page URL (b)
 * and a third value (c) that is later sent as the "oh" field, plus the
 * viewport size and a map of element positions already seen.
 *   f()  - viewport height/width.
 *   g(el) - element top/left in page coordinates (bounding rect + scroll).
 *   h(el) - false for zero-sized elements and for positions already seen;
 *           otherwise true when top/left do not exceed viewport height/width.
 *   i(body) - POSTs body (form-urlencoded) to the beacon URL with
 *           "url=" + the encoded page URL appended to its query string, using
 *           XMLHttpRequest or an ActiveX fallback; returns false if neither
 *           is available.
 *   k()  - scans every img and input element, collects the distinct
 *           pagespeed_url_hash attributes of those passing h(), builds
 *           "oh=...&ci=h1,h2,..." (stops adding hashes once the string would
 *           exceed 131072 characters), stores it in
 *           pagespeed.criticalImagesBeaconData and sends it with i().
 * pagespeed.j(target, type, fn) attaches an event handler via
 * addEventListener, attachEvent, or by chaining the existing on<type> handler.
 * pagespeed.criticalImagesBeaconInit(a, b, c) creates a collector and, from
 * the window load handler, runs k() through setTimeout(..., 0).
 * The final statement initialises the beacon for this page.
 */
(function(){var d=encodeURIComponent,f=window,g=document,h="documentElement",k="length",l="prototype",m="body",p="&",s="&ci=",t=",",u="?",v="Content-Type",w="Microsoft.XMLHTTP",x="Msxml2.XMLHTTP",y="POST",z="application/x-www-form-urlencoded",A="img",B="input",C="load",D="oh=",E="on",F="pagespeed_url_hash",G="url=";f.pagespeed=f.pagespeed||{};var H=f.pagespeed,I=function(a,b,c){this.c=a;this.e=b;this.d=c;this.b=this.f();this.a={}};I[l].f=function(){return{height:f.innerHeight||g[h].clientHeight||g[m].clientHeight,width:f.innerWidth||g[h].clientWidth||g[m].clientWidth}};I[l].g=function(a){a=a.getBoundingClientRect();return{top:a.top+(void 0!==f.pageYOffset?f.pageYOffset:(g[h]||g[m].parentNode||g[m]).scrollTop),left:a.left+(void 0!==f.pageXOffset?f.pageXOffset:(g[h]||g[m].parentNode||g[m]).scrollLeft)}};I[l].h=function(a){if(0>=a.offsetWidth&&0>=a.offsetHeight)return!1;a=this.g(a);var b=a.top.toString()+t+a.left.toString();if(this.a.hasOwnProperty(b))return!1;this.a[b]=!0;return a.top<=this.b.height&&a.left<=this.b.width};I[l].i=function(a){var b;if(f.XMLHttpRequest)b=new XMLHttpRequest;else if(f.ActiveXObject)try{b=new ActiveXObject(x)}catch(c){try{b=new ActiveXObject(w)}catch(e){}}if(!b)return!1;b.open(y,this.c+(-1==this.c.indexOf(u)?u:p)+G+d(this.e));b.setRequestHeader(v,z);b.send(a);return!0};I[l].k=function(){for(var a=[A,B],b=[],c={},e=0;e<a[k];++e)for(var q=g.getElementsByTagName(a[e]),n=0;n<q[k];++n){var r=q[n].getAttribute(F);r&&(q[n].getBoundingClientRect&&this.h(q[n]))&&!(r in c)&&(b.push(r),c[r]=!0)}if(0!=b[k]){a=D+this.d;a+=s+d(b[0]);for(e=1;e<b[k];++e){c=t+d(b[e]);if(131072<a[k]+c[k])break;a+=c}H.criticalImagesBeaconData=a;this.i(a)}};H.j=function(a,b,c){if(a.addEventListener)a.addEventListener(b,c,!1);else if(a.attachEvent)a.attachEvent(E+b,c);else{var e=a[E+b];a[E+b]=function(){c.call(this);e&&e.call(this)}}};H.l=function(a,b,c){var e=new 
I(a,b,c);H.j(f,C,function(){f.setTimeout(function(){e.k()},0)})};H.criticalImagesBeaconInit=H.l;})();pagespeed.criticalImagesBeaconInit('/mod_pagespeed_beacon','http://porg.es/blog/counting-characters-in-utf-8-strings-is-faster','6VtuHMX1ys');
//]]></script></body>
</html>

File diff suppressed because one or more lines are too long