<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Ben J. Christensen</title>
	<atom:link href="http://benjchristensen.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://benjchristensen.com</link>
	<description></description>
	<lastBuildDate>Fri, 24 May 2013 19:39:31 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='benjchristensen.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://0.gravatar.com/blavatar/a7bf6ab05bce6d423674b5a8bb676139?s=96&#038;d=http%3A%2F%2Fs2.wp.com%2Fi%2Fbuttonw-com.png</url>
		<title>Ben J. Christensen</title>
		<link>http://benjchristensen.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://benjchristensen.com/osd.xml" title="Ben J. Christensen" />
	<atom:link rel='hub' href='http://benjchristensen.com/?pushpress=hub'/>
		<item>
		<title>Hystrix in May 2013 ThoughtWorks Tech Radar</title>
		<link>http://benjchristensen.com/2013/05/24/hystrix-in-may-2013-thoughtworks-tech-radar/</link>
		<comments>http://benjchristensen.com/2013/05/24/hystrix-in-may-2013-thoughtworks-tech-radar/#comments</comments>
		<pubDate>Fri, 24 May 2013 19:34:11 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Architecture]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Resilience Engineering]]></category>
		<category><![CDATA[Tools]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=593</guid>
		<description><![CDATA[Hystrix was listed in the May 2013 Tech Radar by ThoughtWorks as &#8220;Assess&#8221; under the Tools section. Managing dependencies in distributed systems can become complicated, and is a problem more people are facing with the move to finer-grained micro services. Hystrix is a library for the JVM from Netflix that implements patterns for dealing with [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=593&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p><a href="https://github.com/Netflix/Hystrix">Hystrix</a> was listed in the <a href="http://thoughtworks.fileburst.com/assets/technology-radar-may-2013.pdf">May 2013 Tech Radar</a> by ThoughtWorks as &#8220;Assess&#8221; under the Tools section.</p>
<blockquote><p>Managing dependencies in distributed systems can become<br />
complicated, and is a problem more people are facing with the<br />
move to finer-grained micro services. Hystrix is a library for<br />
the JVM from Netflix that implements patterns for dealing with<br />
downstream failure, offers real-time monitoring of connections,<br />
and caching and batching mechanisms to make inter-service<br />
dependencies more efficient.</p></blockquote>
<p><img src="http://benjchristensen.files.wordpress.com/2013/01/hystrix-logo-tagline-github-link-640.png" class="aligncenter size-full" style="border:0!important;" /></p>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=593&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2013/05/24/hystrix-in-may-2013-thoughtworks-tech-radar/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2013/01/hystrix-logo-tagline-github-link-640.png?width=640&#38;height=181" medium="image" />
	</item>
		<item>
		<title>Functional Reactive Programming in the Netflix API &#8211; QCon London 2013</title>
		<link>http://benjchristensen.com/2013/05/01/functional-reactive-programming-in-the-netflix-api-qcon-london-2013/</link>
		<comments>http://benjchristensen.com/2013/05/01/functional-reactive-programming-in-the-netflix-api-qcon-london-2013/#comments</comments>
		<pubDate>Thu, 02 May 2013 04:26:52 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Architecture]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Infrastructure]]></category>
		<category><![CDATA[Performance]]></category>
		<category><![CDATA[Production]]></category>
		<category><![CDATA[Resilience Engineering]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=588</guid>
		<description><![CDATA[I had the opportunity to speak at QCon London 2013 and present Functional Reactive Programming in the Netflix API: Slides on Speakerdeck Video on InfoQ Interview on InfoQ on Resilience at Netflix with Hystrix, Reactive Programming for the JVM with RxJava<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=588&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>I had the opportunity to speak at QCon London 2013 and present <a href="http://qconlondon.com/london-2013/speaker/Ben+Christensen">Functional Reactive Programming in the Netflix API</a>:</p>
<ul>
<li><a href="https://speakerdeck.com/benjchristensen/functional-reactive-programming-in-the-netflix-api-qcon-london-2013">Slides on Speakerdeck</a></li>
<li><a href="http://www.infoq.com/presentations/netflix-functional-rx">Video on InfoQ</a></li>
<li><a href="http://www.infoq.com/interviews/christensen-hystrix-rxjava">Interview on InfoQ on Resilience at Netflix with Hystrix, Reactive Programming for the JVM with RxJava</a></li>
</ul>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=588&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2013/05/01/functional-reactive-programming-in-the-netflix-api-qcon-london-2013/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>
	</item>
		<item>
		<title>Functional Reactive in the Netflix API with RxJava</title>
		<link>http://benjchristensen.com/2013/05/01/functional-reactive-in-the-netflix-api-with-rxjava/</link>
		<comments>http://benjchristensen.com/2013/05/01/functional-reactive-in-the-netflix-api-with-rxjava/#comments</comments>
		<pubDate>Thu, 02 May 2013 04:08:24 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Architecture]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Performance]]></category>
		<category><![CDATA[Production]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=563</guid>
		<description><![CDATA[Originally written for and posted on the Netflix Tech Blog. by Ben Christensen and Jafar Husain Our recent post on optimizing the Netflix API introduced how our web service endpoints are implemented using a &#8220;functional reactive programming&#8221; (FRP) model for composition of asynchronous callbacks from our service layer. This post takes a closer look at [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=563&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Originally written for and posted on the <a href="http://techblog.netflix.com/2013/02/rxjava-netflix-api.html">Netflix Tech Blog</a>.</p>
<div style="padding-left:10%;padding-right:10%;">
<p>by <a href="https://twitter.com/benjchristensen/">Ben Christensen</a> and <a href="https://twitter.com/jhusain">Jafar Husain</a><br />
</p>
<p>
Our recent post on <a href="http://techblog.netflix.com/2013/01/optimizing-netflix-api.html">optimizing the Netflix API</a>  introduced how our web service endpoints are implemented using a &#8220;functional reactive programming&#8221; (FRP) model for composition of asynchronous callbacks from our service layer.
</p>
<p>This post takes a closer look at how and why we use the FRP model and introduces our open source project RxJava – a Java implementation of <a href="https://rx.codeplex.com">Rx (Reactive Extensions)</a>.
</p>
<h2>Embrace Concurrency</h2>
<p>
Server-side concurrency is needed to effectively reduce network chattiness. Without concurrent execution on the server, a single &#8220;heavy&#8221; client request might not be much better than many &#8220;light&#8221; requests because each network request from a device naturally executes in parallel with other network requests.  If the server-side execution of a collapsed &#8220;heavy&#8221; request does not achieve a similar level of parallel execution it may be slower than the multiple &#8220;light&#8221; requests even accounting for saved network latency.
</p>
<h2>Futures are Expensive to Compose</h2>
<p>
<a href="http://docs.oracle.com/javase/7/docs/api/java/util/concurrent/Future.html">Futures</a> are straight-forward to use for a <a href="https://gist.github.com/4670979">single level</a> of asynchronous execution but they start to add non-trivial complexity when they&#8217;re <a href="https://gist.github.com/4671081">nested</a>.
</p>
<p>Conditional asynchronous execution flows become <a href="https://gist.github.com/4671081#file-futuresb-java-L163">difficult</a> to optimally compose (particularly as latencies of each request vary at runtime) using Futures. It <a href="http://www.amazon.com/gp/product/0321349601?ie=UTF8&amp;tag=none0b69&amp;linkCode=as2&amp;camp=1789&amp;creative=9325&amp;creativeASIN=0321349601">can be done</a> of course, but it quickly becomes complicated (and thus error prone) or prematurely blocks on &#8216;Future.get()&#8217;, eliminating the benefit of asynchronous execution.
</p>
<h2>Callbacks Have Their Own Problems</h2>
<p>
Callbacks offer a solution to the tendency to block on Future.get() by not allowing anything to block. They are naturally efficient because they execute when the response is ready.
</p>
<p>
Similar to Futures though, they are easy to use with a single level of asynchronous execution but become <a href="https://gist.github.com/4677544">unwieldy</a> with nested composition.
</p>
<h2>Reactive</h2>
<p>
Functional reactive offers efficient execution and composition by providing a collection of operators capable of filtering, selecting, transforming, combining and composing Observables.
</p>
<p>
The Observable data type can be thought of as a &#8220;push&#8221; equivalent to <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Iterable.html">Iterable</a> which is &#8220;pull&#8221;. With an Iterable, the consumer pulls values from the producer and the thread blocks until those values arrive. By contrast with the Observable type, the producer pushes values to the consumer whenever values are available.  This approach is more flexible, because values can arrive synchronously or asynchronously.
</p>
<p>
The Observable type adds two missing semantics to the Gang of Four&#8217;s <a href="http://en.wikipedia.org/wiki/Observer_pattern">Observer</a> pattern, which are available in the Iterable type:
</p>
<ol>
<li>The ability for the producer to signal to the consumer that there is no more data available.</li>
<li>The ability for the producer to signal to the consumer that an error has occurred.</li>
</ol>
<p>
With these two simple additions, we have unified the Iterable and Observable types. The only difference between them is the direction in which the data flows. This is very important because now any operation we perform on an Iterable, can also be performed on an Observable. Let&#8217;s take a look at an example &#8230;
</p>
<script src="https://gist.github.com/4676544.js"></script>
<h2>Observable Service Layer</h2>
<p>
The Netflix API takes advantage of Rx by making the entire service layer asynchronous (or at least appear so) &#8211; all &#8220;service&#8221; methods return an Observable&lt;T&gt;.
</p>
<p>
Making all return types Observable combined with a functional programming model frees up the service layer implementation to safely use concurrency. It also enables the service layer implementation to:
</p>
<ul>
<li>conditionally return immediately from a cache</li>
<li>block instead of using threads if resources are constrained</li>
<li>use multiple threads</li>
<li>use non-blocking IO</li>
<li>migrate an underlying implementation from network based to in-memory cache</li>
</ul>
<p>
This can all happen without ever changing how client code interacts with or composes responses.
</p>
<p>
In short, client code treats all interactions with the API as asynchronous but the implementation chooses if something is blocking or non-blocking.
</p>
<p>
This next example code demonstrates how a service layer method can choose whether to synchronously return data from an in-memory cache or asynchronously retrieve data from a remote service and callback with the data once retrieved. In both cases the client code consumes it the same way.
</p>
<script src="https://gist.github.com/4675568.js"></script>
<p>
Retaining this level of control in the service layer is a major architectural advantage particularly for maintaining and optimizing functionality over time. Many different endpoint implementations can be coded against an Observable API and they work efficiently and correctly with the current thread or one or more worker threads backing their execution.
</p>
<p>
The following code demonstrates the consumption of an Observable API with a common Netflix use case – a grid of movies:
</p>
<script src="https://gist.github.com/4679253.js"></script>
<p>
That code is declarative and <a href="http://en.wikipedia.org/wiki/Lazy_evaluation">lazy</a> as well as functionally &#8220;pure&#8221; in that no mutation of state is occurring that would cause thread-safety issues.
</p>
<p>
The API Service Layer is now free to change the behavior of the methods &#8216;getListOfLists&#8217;, &#8216;getVideos&#8217;, &#8216;getMetadata&#8217;, &#8216;getBookmark&#8217; and &#8216;getRating&#8217; – some blocking others non-blocking but all consumed the same way.
</p>
<p>
In the example, &#8216;getListOfLists&#8217; pushes each &#8216;VideoList&#8217; object via &#8216;onNext()&#8217; and then &#8216;getVideos()&#8217; operates on that same parent thread. The implementation of that method could however change from blocking to non-blocking and the code would not need to change.
</p>
<h2>RxJava</h2>
<p>
RxJava is our implementation of Rx for the JVM and is available in the <a href="https://github.com/Netflix/RxJava">Netflix repository in Github</a>.
</p>
<p>
It is not yet feature complete with the .Net version of Rx, but what is implemented has been in use for the past year in production within the Netflix API.
</p>
<p>
We are open sourcing the code as version 0.5 as a way to acknowledge that it&#8217;s not yet feature complete. The outstanding work is logged in the <a href="https://github.com/Netflix/RxJava/issues?milestone=1&amp;state=open">RxJava Issues</a>.
</p>
<p>
Documentation is available on the <a href="https://github.com/Netflix/RxJava/wiki">RxJava Wiki</a> including links to material available on the internet.
</p>
<p>
Some of the goals of RxJava are:
</p>
<ul>
<li>Stay close to the original Rx.Net implementation while adjusting naming conventions and idioms to Java</li>
<li>All contracts of Rx should be the same</li>
<li>Target the JVM not a language. The first languages supported (beyond Java itself) are <a href="https://github.com/Netflix/RxJava/tree/master/language-adaptors/rxjava-groovy">Groovy</a>, <a href="https://github.com/Netflix/RxJava/tree/master/language-adaptors/rxjava-clojure">Clojure</a>, <a href="https://github.com/Netflix/RxJava/tree/master/language-adaptors/rxjava-scala">Scala</a> and <a href="https://github.com/Netflix/RxJava/tree/master/language-adaptors/rxjava-jruby">JRuby</a>. New language adapters can be <a href="https://github.com/Netflix/RxJava/wiki/How-to-Contribute">contributed</a>.</li>
<li>Support Java 5 (to include Android support) and higher with an eventual goal to target a build for Java 8 with its lambda support.</li>
</ul>
<p>
Here is an implementation of one of the examples above but using Clojure instead of Groovy:
</p>
<script src="https://gist.github.com/4676533.js"></script>
<h1>Summary</h1>
<p>
Functional reactive programming with RxJava has enabled Netflix developers to leverage server-side concurrency without the typical thread-safety and synchronization concerns. The API service layer implementation has control over concurrency primitives, which enables us to pursue system performance improvements without fear of breaking client code.
</p>
<p>
RxJava is effective on the server for us and it spreads deeper into our code the more we use it.
</p>
<p>
We hope you find the RxJava project as useful as we have and look forward to your contributions.
</p>
</div>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=563&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2013/05/01/functional-reactive-in-the-netflix-api-with-rxjava/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>
	</item>
		<item>
		<title>Optimizing the Netflix API</title>
		<link>http://benjchristensen.com/2013/04/30/optimizing-the-netflix-api/</link>
		<comments>http://benjchristensen.com/2013/04/30/optimizing-the-netflix-api/#comments</comments>
		<pubDate>Wed, 01 May 2013 04:05:36 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Architecture]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Infrastructure]]></category>
		<category><![CDATA[Performance]]></category>
		<category><![CDATA[Production]]></category>
		<category><![CDATA[Resilience Engineering]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=574</guid>
		<description><![CDATA[Originally written for and posted on the Netflix Tech Blog: About a year ago the Netflix API team began redesigning the API to improve performance and enable UI engineering teams within Netflix to optimize client applications for specific devices. Philosophies of the redesign were introduced in a previous post about embracing the differences between the [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=574&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Originally written for and posted on the <a href="http://techblog.netflix.com/2013/01/optimizing-netflix-api.html">Netflix Tech Blog</a>:</p>
<div style="padding-left:10%;padding-right:10%;">
<p>About a year ago the Netflix API team began redesigning the API to improve performance and enable UI engineering teams within Netflix to optimize client applications for specific devices.  Philosophies of the redesign were introduced in a previous post about <a href="http://techblog.netflix.com/2012/07/embracing-differences-inside-netflix.html">embracing the differences</a> between the different clients and devices.</p>
<p>This post is part one of a series on the architecture of our redesigned API.</p>
<p></p>
<h1>Goals</h1>
<p>
We had multiple goals in creating this system, as follows:</p>
<h3>
Reduce Chattiness</h3>
<p>
One of the key drivers in pursuing the <a href="http://techblog.netflix.com/2011/02/redesigning-netflix-api.html">redesign</a> in the first place was to reduce the chatty nature of our client/server communication, which could be hindering the overall performance of our device implementations.</p>
<p>Due to the generic and granular nature of the original REST-based Netflix API, each call returns only a portion of functionality for a given user experience, requiring client applications to make multiple calls that need to be assembled in order to render a single user experience.  This interaction model is illustrated in the following diagram:<br />
<br />
<img height="265" src="http://benjchristensen.files.wordpress.com/2013/05/90083-request-multi_1252.png?w=626&#038;h=265" width="626" style="border:0!important;" /></p>
<p>
To reduce the chattiness inherent in the REST API, the discrete requests in the diagram above should be collapsed into a single request optimized for a given client.  The benefit is that the device then pays the price of WAN latency once and leverages the low latency and more powerful hardware server-side. As a side effect, this also eliminates redundancies that occur for every incoming request.<br />
<br />
<img height="265" src="http://benjchristensen.files.wordpress.com/2013/05/7d4c6-request-single_1252.png?w=626&#038;h=265" width="626" style="border:0!important;" /></p>
<p>
A single optimized request such as this must embrace server-side parallelism to at least the same level as previously achieved through multiple network requests from the client.  Because the server-side parallelized requests are running in the same network, each one should be more performant than if it was executed from the device.  This must be achieved without each engineer implementing an endpoint needing to become an expert in low-level threading, synchronization, thread-safety, concurrent data structures, non-blocking IO and other such concerns.</p>
<h3>
Distribute API Development</h3>
<p>
A single team should not become a bottleneck nor need to have expertise on every client application to create optimized endpoints.  Rapid innovation through fast, decoupled development cycles across a wide variety of device types and distributed ownership and expertise across teams should be enabled.  Each client application team should be capable of implementing and operating their own endpoints and the corresponding requests/responses.</p>
<h3>
Mitigate Deployment Risks</h3>
<p>
The Netflix API is a Java application running on hundreds of servers processing 2+ billion incoming requests a day for millions of customers around the world.  The system must mitigate risks inherent in enabling rapid and frequent deployment by multiple teams with minimal coordination.</p>
<h3>
Support Multiple Languages</h3>
<p>
Engineers implementing endpoints come from a wide variety of backgrounds with expertise including Javascript, Objective-C, Java, C, C#, Ruby, Python and others.  The system should be able to support multiple languages at the same time.</p>
<h3>
Distribute Operations</h3>
<p>
Each client team will now manage the deployment lifecycle of their own web service endpoints.  Operational tools for monitoring, debugging, testing, canarying and rolling out code must be exposed to a distributed set of teams so teams can operate independently.</p>
<p></p>
<h1>Architecture</h1>
<p>
To achieve the goals above, our architecture was distilled into a few key points:</p>
<p></p>
<ul>
<li>dynamic polyglot runtime</li>
<li>fully asynchronous service layer</li>
<li>functional reactive programming model</li>
</ul>
<p>The following diagram and subsequent annotations explain the architecture:</p>
<p><img height="620" src="http://benjchristensen.files.wordpress.com/2013/05/72a7d-architecture-overview_1252.png?w=626&#038;h=620" width="626" style="border:0!important;" /><br />
</p>
<h3>
[1] Dynamic Endpoints</h3>
<p>
All new web service endpoints are now dynamically defined at runtime. New endpoints can be developed, tested, canaried and deployed by each client team without coordination (unless they depend on new functionality from the underlying API Service Layer shown at item 5 in which case they would need to wait until after those changes are deployed before pushing their endpoint).</p>
<h3>
[2] Endpoint Code Repository and Management</h3>
<p>
Endpoint code is published to a Cassandra multi-region cluster (globally replicated) via a RESTful Endpoint Management API used by client teams to manage their endpoints.</p>
<h3>
[3] Dynamic Polyglot JVM Language Runtime</h3>
<p>
Any JVM language can be supported so each team can use the language best suited to them.<br />
</p>
<p>The Groovy JVM language was chosen as our first supported language. The existence of first-class functions (closures), list/dictionary syntax, performance and debuggability were all aspects of our decision.  Moreover, Groovy provides syntax comfortable to a wide range of developers, which helps to reduce the learning curve for the first language on the platform.</p>
<h3>
[4 &amp; 5] Asynchronous Java API + Functional Reactive Programming Model</h3>
<p>
Embracing concurrency was a key requirement to achieve performance gains but abstracting away thread-safety and parallel execution implementation details from the client developers was equally important in reducing complexity and speeding up their rate of innovation.  Making the Java API fully asynchronous was the first step as it allows the underlying method implementations to control whether something is executed concurrently or not without the client code changing.  We chose a functional reactive approach to handling composition and conditional flows of asynchronous callbacks. Our implementation is modeled after <a href="https://rx.codeplex.com/">Rx Observables</a>.</p>
<h3>
[6] Hystrix Fault Tolerance</h3>
<p>
As we have described in a <a href="http://techblog.netflix.com/2012/02/fault-tolerance-in-high-volume.html">previous post</a>, all service calls to backend systems are made via the Hystrix fault tolerance layer (which was <a href="http://techblog.netflix.com/2012/11/hystrix.html">recently open sourced</a>, along with its <a href="http://techblog.netflix.com/2012/12/hystrix-dashboard-and-turbine.html">dashboard</a>) that isolates the dynamic endpoints and the API Service Layer from the inevitable failures that occur while executing billions of network calls each day from the API to backend systems.<br />
</p>
<p>The Hystrix layer is inherently multi-threaded due to its use of threads for isolating dependencies and thus is leveraged for concurrent execution of blocking calls to backend systems. These asynchronous requests are then composed together via the functional reactive framework.</p>
<h3>
[7] Backend Services and Dependencies</h3>
<p>
The API Service Layer abstracts away all backend services and dependencies behind facades.  As a result, endpoint code accesses “functionality” rather than a “system”.  This allows us to change underlying implementations and architecture with no or limited impact on the code that depends on the API.  For example, if a backend system is split into 2 different services, or 3 are combined into one, or a remote network call is optimized into an in-memory cache, none of these changes should affect endpoint code and thus the API Service Layer ensures that object models and other such tight-couplings are abstracted and not allowed to “leak” into the endpoint code.</p>
<h2>
Summary</h2>
<p>
The new Netflix API architecture is a significant departure from our previous generic RESTful API.<br />
</p>
<p>Dynamic JVM languages combined with an asynchronous Java API and the functional reactive programming model have proven to be a powerful combination to enable safe and efficient development of highly concurrent code.</p>
<p>The end result is a fault-tolerant, performant platform that puts control in the hands of those who know their target applications the best.</p>
<p>Following posts will provide further implementation and operational details about this new architecture.</p>
</div>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=574&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2013/04/30/optimizing-the-netflix-api/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2013/05/90083-request-multi_1252.png" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2013/05/7d4c6-request-single_1252.png" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2013/05/72a7d-architecture-overview_1252.png" medium="image" />
	</item>
		<item>
		<title>Hystrix Dashboard + Turbine Stream Aggregator</title>
		<link>http://benjchristensen.com/2013/01/15/hystrix-dashboard-turbine-stream-aggregator/</link>
		<comments>http://benjchristensen.com/2013/01/15/hystrix-dashboard-turbine-stream-aggregator/#comments</comments>
		<pubDate>Tue, 15 Jan 2013 20:49:51 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Architecture]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Infrastructure]]></category>
		<category><![CDATA[Performance]]></category>
		<category><![CDATA[Production]]></category>
		<category><![CDATA[Resilience Engineering]]></category>
		<category><![CDATA[Tools]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=544</guid>
		<description><![CDATA[Originally posted to the Netflix Tech Blog: by Ben Christensen, Puneet Oberai and Ben Schmaus Two weeks ago&#160;we introduced Hystrix, a library for engineering resilience into distributed systems. Today we&#8217;re open sourcing the Hystrix dashboard application, as well as a new companion project called Turbine that provides low latency event stream aggregation. The Hystrix dashboard [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=544&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Originally posted to the <a href="http://techblog.netflix.com/2012/12/hystrix-dashboard-and-turbine.html">Netflix Tech Blog</a>:</p>
<div style="padding-left:10%;padding-right:10%;">
<p>by Ben Christensen, Puneet Oberai and Ben Schmaus</p>
<p><a href="http://techblog.netflix.com/2012/11/hystrix.html">Two weeks ago</a>&nbsp;we introduced <a href="https://github.com/Netflix/Hystrix">Hystrix</a>, a library for engineering resilience into distributed systems. Today we&#8217;re open sourcing the Hystrix dashboard application, as well as a new companion project called Turbine that provides low latency event stream aggregation.</p>
<div class="separator" style="clear:both;text-align:center;">
</div>
<p><a href="https://github.com/Netflix/Hystrix"><br />
<img border="0" height="181" src="http://benjchristensen.files.wordpress.com/2013/01/hystrix-logo-tagline-github-link-640.png?w=640&#038;h=181" style="border:0;padding:0;" width="640" /></a></p>
<p>
The Hystrix dashboard has significantly improved our operations by reducing discovery and recovery times during operational events. The duration of most production incidents (already less frequent due to Hystrix) is far shorter, with diminished impact, because we are now able to get realtime insights (1-2 second latency) into system behavior.</p>
<p>The following snapshot shows six HystrixCommands being used by the Netflix API.  Under the hood of this example dashboard, Turbine is aggregating data from 581 servers into a single stream of metrics supporting the dashboard application, which in turn streams the aggregated data to the browser for display in the UI.</p>
<p><img border="0" height="248" src="http://benjchristensen.files.wordpress.com/2013/01/hystrix-dashboard-netflix-api-example-620.png?w=620&#038;h=248" style="border:0;padding:0;" width="620" /></p>
<p>
When a circuit is failing then it changes colors (gradient from green through yellow, orange and red) such as this:</p>
<p><img src="http://benjchristensen.files.wordpress.com/2013/01/dashboard-example-open-circuit-640.png?w=640&#038;h=183" style="border:0;padding:0;" width="640" height="183" /></p>
<p>The diagram below shows one &#8220;circuit&#8221; from the dashboard along with explanations of what all of the data represents.</p>
<p>We&#8217;ve purposefully tried to pack a lot of information into the dashboard so that engineers can quickly consume and correlate data.</p>
<p><img border="0" height="411" src="http://benjchristensen.files.wordpress.com/2013/01/dashboard-annoted-circuit-640.png?w=640&#038;h=411" style="border:0;padding:0;" width="640" /></p>
<p>The following video shows the dashboard operating with data from a Netflix API cluster:</p>
<span class='embed-youtube' style='text-align:center; display: block;'><iframe class='youtube-player' type='text/html' width='640' height='360' src='http://www.youtube.com/embed/zWM7oAbVL4g?version=3&#038;rel=1&#038;fs=1&#038;showsearch=0&#038;showinfo=1&#038;iv_load_policy=1&#038;wmode=transparent' frameborder='0'></iframe></span>
<p>The Turbine deployment at Netflix connects to thousands of Hystrix-enabled servers and aggregates realtime streams from them. Netflix uses Turbine with a <a href="https://github.com/Netflix/eureka">Eureka</a> plugin that handles instances joining and leaving clusters (due to autoscaling, red/black deployments, or just being unhealthy).</p>
<p>Our alerting systems have also started migrating to Turbine-powered metrics streams so that in one minute of data there are dozens or hundreds of points of data for a single metric.&nbsp;This high resolution of metrics data makes for better and faster alerting.</p>
<p>The Hystrix dashboard can be used either to monitor an individual instance without Turbine or in conjunction with Turbine to monitor multi-machine clusters:</p>
<p><img border="0" height="380" src="http://benjchristensen.files.wordpress.com/2013/01/dashboard-direct-vs-turbine-640.png?w=640&#038;h=380" style="border:0;padding:0;" width="640" /></p>
<p>Turbine can be found on GitHub at: <a href="https://github.com/Netflix/Turbine">https://github.com/Netflix/Turbine</a></p>
<p>Dashboard documentation is at: <a href="https://github.com/Netflix/Hystrix/wiki/Dashboard">https://github.com/Netflix/Hystrix/wiki/Dashboard</a></p>
<p>We expect people to want to customize the UI so the javascript modules have been implemented in a way that they can easily be used standalone in existing dashboards and applications. We also expect different perspectives on how to visualize and represent data and look forward to contributions back to both Hystrix and Turbine.</p>
<p>We are always looking for talented engineers so if you&#8217;re interested in this type of work contact us via <a href="http://jobs.netflix.com/jobs.html">jobs.netflix.com</a>.</p>
</div>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=544&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2013/01/15/hystrix-dashboard-turbine-stream-aggregator/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2013/01/hystrix-logo-tagline-github-link-640.png" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2013/01/hystrix-dashboard-netflix-api-example-620.png" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2013/01/dashboard-example-open-circuit-640.png" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2013/01/dashboard-annoted-circuit-640.png" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2013/01/dashboard-direct-vs-turbine-640.png" medium="image" />
	</item>
		<item>
		<title>Hystrix for Resilience Engineering</title>
		<link>http://benjchristensen.com/2012/11/26/hystrix-for-resilience-engineering/</link>
		<comments>http://benjchristensen.com/2012/11/26/hystrix-for-resilience-engineering/#comments</comments>
		<pubDate>Tue, 27 Nov 2012 04:33:56 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Architecture]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Production]]></category>
		<category><![CDATA[Production Problems]]></category>
		<category><![CDATA[Resilience Engineering]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=519</guid>
		<description><![CDATA[Today Hystrix was released on GitHub at http://github.com/Netflix/Hystrix. It is a latency and fault tolerance library used for resilience engineering and something I have spent a good chunk of time on at Netflix. I&#8217;m happy to see it get released as open-source and be able to continue evolving it (hopefully with community involvement). As written [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=519&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Today Hystrix was released on GitHub at <a href="http://github.com/Netflix/Hystrix">http://github.com/Netflix/Hystrix</a>.</p>
<p>It is a latency and fault tolerance library used for resilience engineering and something I have spent a good chunk of time on at Netflix. I&#8217;m happy to see it get released as open-source and be able to continue evolving it (hopefully with community involvement).</p>
<p>As written originally for the <a href="http://techblog.netflix.com/2012/11/hystrix.html">Netflix Tech Blog</a>:</p>
<blockquote><p>In a distributed environment, failure of any given service is inevitable. Hystrix is a library designed to control the interactions between these distributed services providing greater tolerance of latency and failure. Hystrix does this by isolating points of access between the services, stopping cascading failures across them, and providing fallback options, all of which improve the system&#8217;s overall resiliency.</p>
<p>Hystrix evolved out of resilience engineering work that the Netflix API team began in 2011. Over the course of 2012, Hystrix continued to evolve and mature, eventually leading to adoption across many teams within Netflix. Today tens of billions of thread-isolated and hundreds of billions of semaphore-isolated calls are executed via Hystrix every day at Netflix and a dramatic improvement in uptime and resilience has been achieved through its use.</p></blockquote>
<p><img src="http://benjchristensen.files.wordpress.com/2012/11/hystrix-logo-tagline-vertical.png?w=406" alt="" title="hystrix-logo-tagline-vertical"   class="aligncenter size-full wp-image-520" /></p>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=519&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2012/11/26/hystrix-for-resilience-engineering/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2012/11/hystrix-logo-tagline-vertical.png" medium="image">
			<media:title type="html">hystrix-logo-tagline-vertical</media:title>
		</media:content>
	</item>
		<item>
		<title>Performance and Fault Tolerance for the Netflix API &#8211; QCon Sao Paulo</title>
		<link>http://benjchristensen.com/2012/08/12/performance-and-fault-tolerance-for-the-netflix-api-qcon-sao-paulo/</link>
		<comments>http://benjchristensen.com/2012/08/12/performance-and-fault-tolerance-for-the-netflix-api-qcon-sao-paulo/#comments</comments>
		<pubDate>Sun, 12 Aug 2012 22:24:02 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Architecture]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Performance]]></category>
		<category><![CDATA[Production]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=504</guid>
		<description><![CDATA[A presentation I gave at QCon Sao Paulo on August 4th 2012 (http://qconsp.com/palestrante/ben-christensen) Presentation Description The Netflix API receives over a billion requests a day which translates into multiple billions of calls to underlying systems in the Netflix service-oriented architecture. These requests come from more than 800 different devices ranging from gaming consoles like the [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=504&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>A presentation I gave at QCon Sao Paulo on August 4th 2012 (<a href="http://qconsp.com/palestrante/ben-christensen" rel="nofollow">http://qconsp.com/palestrante/ben-christensen</a>)</p>
<p><strong>Presentation Description</strong></p>
<p>The Netflix API receives over a billion requests a day which translates into multiple billions of calls to underlying systems in the Netflix service-oriented architecture. These requests come from more than 800 different devices ranging from gaming consoles like the PS3, Xbox and Wii to set-top boxes, TVs and mobile devices such as Android and iOS.</p>
<p>This presentation describes how the Netflix API supports those devices and achieves fault tolerance in a distributed architecture while depending on dozens of systems which can fail at any time. It also explains how a new system design allows each device to optimize API calls to their unique needs and leverage concurrency on the server-side to improve their performance.</p>
<p>(Some slides have been modified and notes included for readability and understanding of content without accompanying speech.)</p>
<p style="text-align:center;"><iframe src='http://www.slideshare.net/slideshow/embed_code/13927636' width='406' height='333'></iframe></p>
<p style="text-align:left;">Slides also available at <a href="https://speakerdeck.com/u/benjchristensen/p/performance-and-fault-tolerance-for-the-netflix-api-qcon-sao-paulo">SpeakerDeck</a>.</p>
<p style="text-align:center;"><img class="aligncenter size-full wp-image-508" src="http://benjchristensen.files.wordpress.com/2012/08/azdpfe3cyaacgwb.jpg?w=406" alt=""   /></p>
<p style="text-align:center;"><img class="aligncenter size-full wp-image-509" src="http://benjchristensen.files.wordpress.com/2012/08/azdmroxcyaey4hz.jpg?w=406" alt=""   /></p>
<p style="text-align:center;"><a href="https://maps.google.com/maps?q=%22Federacao+Comercio+Estado+Sao+Paulo%22,+S%C3%A3o+Paulo,+Brazil&amp;hl=en&amp;ie=UTF8&amp;ll=-23.557654,-46.652498&amp;spn=0.030999,0.065103&amp;sll=-23.558425,-46.653431&amp;sspn=0.041384,0.07699&amp;hq=%22Federacao+Comercio+Estado+Sao+Paulo%22,&amp;hnear=Sao+Paulo+-+S%C3%A3o+Paulo,+Brazil&amp;t=m&amp;z=15&amp;layer=c&amp;cbll=-23.557616,-46.652401&amp;panoid=FGfV2pbo0UZi3fgLfQOSgQ&amp;cbp=12,145.88,,0,-4.96">Conference center on Google Maps</a></p>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=504&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2012/08/12/performance-and-fault-tolerance-for-the-netflix-api-qcon-sao-paulo/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2012/08/azdpfe3cyaacgwb.jpg" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2012/08/azdmroxcyaey4hz.jpg" medium="image" />
	</item>
		<item>
		<title>Interactive Line Graph using d3.js</title>
		<link>http://benjchristensen.com/2012/05/15/interactive-line-graph-using-d3-js/</link>
		<comments>http://benjchristensen.com/2012/05/15/interactive-line-graph-using-d3-js/#comments</comments>
		<pubDate>Tue, 15 May 2012 22:57:46 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Code]]></category>
		<category><![CDATA[User Interface]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=500</guid>
		<description><![CDATA[Continuing with the series of examples I&#8217;ve provided while using d3.js, this one is a javascript object which renders data as a line graph which can then be interacted with by scrubbing over it or changing the y-axis scale. Click on the image or the bl.ocks.org link to see it in action. Code: https://gist.github.com/2657838 http://bl.ocks.org/2657838<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=500&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Continuing with the series of examples I&#8217;ve provided while using d3.js, this one is a javascript object which renders data as a line graph which can then be interacted with by scrubbing over it or changing the y-axis scale.</p>
<p><a href="http://bl.ocks.org/2657838"><img src="http://benjchristensen.files.wordpress.com/2012/05/interactive-line-graph.png?w=800" alt="" title="interactive-line-graph" width="800" height="352" class="aligncenter size-full wp-image-501" /></a></p>
<p>Click on the image or the <a href="http://bl.ocks.org/2657838">bl.ocks.org</a> link to see it in action.</p>
<p>Code: <a href="https://gist.github.com/2657838">https://gist.github.com/2657838</a> <a href="http://bl.ocks.org/2657838">http://bl.ocks.org/2657838</a></p>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=500&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2012/05/15/interactive-line-graph-using-d3-js/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2012/05/interactive-line-graph.png?w=800" medium="image">
			<media:title type="html">interactive-line-graph</media:title>
		</media:content>
	</item>
		<item>
		<title>Line Graphs Using d3.js</title>
		<link>http://benjchristensen.com/2012/05/02/line-graphs-using-d3-js/</link>
		<comments>http://benjchristensen.com/2012/05/02/line-graphs-using-d3-js/#comments</comments>
		<pubDate>Wed, 02 May 2012 20:04:33 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Code]]></category>
		<category><![CDATA[User Interface]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=488</guid>
		<description><![CDATA[Simple examples of line graphs implemented using d3.js: Simple Line Graph http://bl.ocks.org/2579599 https://gist.github.com/2579599 Line Graph with Dual-scaled Axes http://bl.ocks.org/2579619 https://gist.github.com/2579619 Line graph over time with multiple data points http://bl.ocks.org/2580640 https://gist.github.com/2580640 UPDATE: I added an interactive version with scrubbing and dynamic updating.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=488&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Simple examples of line graphs implemented using d3.js:</p>
<p><a href="http://bl.ocks.org/2579599"><img class="aligncenter size-full wp-image-489" title="" src="http://benjchristensen.files.wordpress.com/2012/05/line-graph-single-axis.png?w=800" alt="" width="800" height="254" /></a></p>
<p>Simple Line Graph <a href="http://bl.ocks.org/2579599">http://bl.ocks.org/2579599</a> <a href="https://gist.github.com/2579599">https://gist.github.com/2579599</a></p>
<p><a href="http://bl.ocks.org/2579619"><img class="aligncenter size-full wp-image-490" title="line-graph-dual-axes" src="http://benjchristensen.files.wordpress.com/2012/05/line-graph-dual-axes.png?w=800" alt="" width="800" height="271" /></a></p>
<p>Line Graph with Dual-scaled Axes <a href="http://bl.ocks.org/2579619">http://bl.ocks.org/2579619</a> <a href="https://gist.github.com/2579619">https://gist.github.com/2579619</a></p>
<p><a href="http://bl.ocks.org/2580640"><img class="aligncenter size-full wp-image-496" title="" src="http://benjchristensen.files.wordpress.com/2012/05/line-graph-over-time.png?w=800" alt="" width="800" height="239" /></a></p>
<p>Line graph over time with multiple data points <a href="http://bl.ocks.org/2580640">http://bl.ocks.org/2580640</a> <a href="https://gist.github.com/2580640">https://gist.github.com/2580640</a></p>
<p>UPDATE: I added an <a href="http://benjchristensen.com/2012/05/15/interactive-line-graph-using-d3-js/">interactive version</a> with scrubbing and dynamic updating.</p>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=488&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2012/05/02/line-graphs-using-d3-js/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2012/05/line-graph-single-axis.png?w=800" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2012/05/line-graph-dual-axes.png?w=800" medium="image">
			<media:title type="html">line-graph-dual-axes</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2012/05/line-graph-over-time.png?w=800" medium="image" />
	</item>
		<item>
		<title>Fault Tolerance in a High Volume, Distributed System</title>
		<link>http://benjchristensen.com/2012/03/01/fault-tolerance-in-a-high-volume-distributed-system/</link>
		<comments>http://benjchristensen.com/2012/03/01/fault-tolerance-in-a-high-volume-distributed-system/#comments</comments>
		<pubDate>Thu, 01 Mar 2012 18:45:49 +0000</pubDate>
		<dc:creator>Ben Christensen</dc:creator>
				<category><![CDATA[Architecture]]></category>
		<category><![CDATA[Code]]></category>
		<category><![CDATA[Infrastructure]]></category>
		<category><![CDATA[Performance]]></category>
		<category><![CDATA[Production]]></category>
		<category><![CDATA[Production Problems]]></category>
		<category><![CDATA[distributed system]]></category>
		<category><![CDATA[fault tolerance]]></category>
		<category><![CDATA[high volume]]></category>
		<category><![CDATA[resiliency]]></category>
		<category><![CDATA[service oriented architecture]]></category>

		<guid isPermaLink="false">http://benjchristensen.com/?p=465</guid>
		<description><![CDATA[Originally posted on the Netflix Tech Blog: In an earlier post by Ben Schmaus, we shared the principles behind our circuit-breaker implementation. In that post, Ben discusses how the Netflix API interacts with dozens of systems in our service-oriented architecture, which makes the API inherently more vulnerable to any system failures or latencies underneath it [&#8230;]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=465&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
				<content:encoded><![CDATA[<p>Originally posted on the <a href="http://techblog.netflix.com/2012/02/fault-tolerance-in-high-volume.html">Netflix Tech Blog</a>:</p>
<hr />
<p>In an <a href="http://techblog.netflix.com/2011/12/making-netflix-api-more-resilient.html">earlier post</a> by <a href="http://twitter.com/schmaus">Ben Schmaus</a>, we shared the principles behind our circuit-breaker implementation. In that post, Ben discusses how the Netflix API interacts with dozens of systems in our service-oriented architecture, which makes the API inherently more vulnerable to any system failures or latencies underneath it in the stack. The rest of this post provides a more technical deep-dive into how our API and other systems isolate failure, shed load and remain resilient to failures.</p>
<p><span style="font-size:large;font-weight:bold;">Fault Tolerance is a Requirement, Not a Feature</span></p>
<p>The Netflix API receives more than 1 billion incoming calls per day which in turn fans out to several billion outgoing calls (averaging a ratio of 1:6) to dozens of underlying subsystems with peaks of over 100k dependency requests per second.<br />
<a href="http://benjchristensen.files.wordpress.com/2012/03/dependencies1.png"><img class="aligncenter size-full wp-image-469" title="" src="http://benjchristensen.files.wordpress.com/2012/03/dependencies1.png?w=800" alt="" width="800" height="727" border="0" /></a></p>
<p>This all occurs in the cloud across thousands of EC2 instances.</p>
<p>Intermittent failure is guaranteed with this many variables, even if every dependency itself has excellent availability and uptime.</p>
<p>Without taking steps to ensure fault tolerance, 30 dependencies each with 99.99% uptime would result in 2+ hours downtime/month (<span style="font-size:x-small;">99.99%<sup>30</sup> = 99.7% uptime = 2+ hours in a month</span>).</p>
<p>When a single API dependency fails at high volume with increased latency (causing blocked request threads) it can rapidly (seconds or sub-second) saturate all available Tomcat (or other container such as Jetty) request threads and take down the entire API.</p>
<p><a href="http://benjchristensen.files.wordpress.com/2012/03/dependencies3.png"><img class="aligncenter size-full wp-image-471" title="" src="http://benjchristensen.files.wordpress.com/2012/03/dependencies3.png?w=800" alt="" width="800" height="728" /></a></p>
<p>Thus, it is a requirement of high volume, high availability applications to build fault tolerance into their architecture and not expect infrastructure to solve it for them.</p>
<p><span style="font-size:large;font-weight:bold;">Netflix DependencyCommand Implementation</span></p>
<p>The service-oriented architecture at Netflix allows each team freedom to choose the best transport protocols and formats (XML, JSON, Thrift, Protocol Buffers, etc) for their needs so these approaches may vary across services.</p>
<p>In most cases the team providing a service also distributes a Java client library.</p>
<p>Because of this, applications such as API in effect treat the underlying dependencies as 3rd party client libraries whose implementations are &#8220;black boxes&#8221;. This in turn affects how fault tolerance is achieved.</p>
<p>In light of the above architectural considerations we chose to implement a solution that uses a combination of fault tolerance approaches:</p>
<ul>
<li><span style="font-size:100%;">network timeouts and retries</span></li>
<li><span style="font-size:100%;">separate threads on per-dependency thread pools</span></li>
<li><span style="font-size:100%;">semaphores (via a </span><a style="font-size:100%;" href="http://docs.oracle.com/javase/6/docs/api/java/util/concurrent/Semaphore.html#tryAcquire()">tryAcquire</a><span style="font-size:100%;">, not a blocking call)</span></li>
<li><span style="font-size:100%;">circuit breakers</span></li>
</ul>
<p>Each of these approaches to fault-tolerance has pros and cons but when combined together provide a comprehensive protective barrier between user requests and underlying dependencies.</p>
<p><a href="http://benjchristensen.files.wordpress.com/2012/03/faulttolerancetypes.png"><img class="aligncenter size-full wp-image-477" title="" src="http://benjchristensen.files.wordpress.com/2012/03/faulttolerancetypes.png?w=800" alt="" width="800" height="610" /></a></p>
<p>The Netflix DependencyCommand implementation wraps a network-bound dependency call with a preference towards executing in a separate thread and defines fallback logic which gets executed (step 8 in flow chart below) for any failure or rejection (steps 3, 4, 5a, 6b below) regardless of which type of fault tolerance (network or thread timeout, thread pool or semaphore rejection, circuit breaker) triggered it.</p>
<p><a href="http://benjchristensen.files.wordpress.com/2012/03/dependencycommands.png"><img class="aligncenter size-full wp-image-475" title="" src="http://benjchristensen.files.wordpress.com/2012/03/dependencycommands.png?w=800" alt="" width="800" height="325" /></a></p>
<p>We decided that the benefits of isolating dependency calls into separate threads outweigh the drawbacks (in most cases). Also, since the API is progressively <a href="http://techblog.netflix.com/2011/02/redesigning-netflix-api.html">moving towards increased concurrency</a> it was a win-win to achieve both fault tolerance and performance gains through concurrency with the same solution. In other words, the overhead of separate threads is being turned into a positive in many use cases by leveraging the concurrency to execute calls in parallel and speed up delivery of the Netflix experience to users.</p>
<p>Thus, most dependency calls now route through a separate thread-pool as the following diagram illustrates:</p>
<p><a href="http://benjchristensen.files.wordpress.com/2012/03/dependencies4.png"><img class="aligncenter size-full wp-image-472" title="" src="http://benjchristensen.files.wordpress.com/2012/03/dependencies4.png?w=800" alt="" width="800" height="1086" /></a></p>
<p>If a dependency becomes latent (the worst-case type of failure for a subsystem) it can saturate all of the threads in its own thread pool, but Tomcat request threads will timeout or be rejected immediately rather than blocking.</p>
<p><a href="http://benjchristensen.files.wordpress.com/2012/03/dependencies6.png"><img class="aligncenter size-full wp-image-474" title="" src="http://benjchristensen.files.wordpress.com/2012/03/dependencies6.png?w=800" alt="" width="800" height="324" /></a></p>
<p>In addition to the isolation benefits and concurrent execution of dependency calls we have also leveraged the separate threads to enable request collapsing (automatic batching) to increase overall efficiency and reduce user request latencies.</p>
<p>Semaphores are used instead of threads for dependency executions known to not perform network calls (such as those only doing in-memory cache lookups) since the overhead of a separate thread is too high for these types of operations.</p>
<p>We also use semaphores to protect against non-trusted fallbacks. Each DependencyCommand is able to define a fallback function (discussed more below) which is performed on the calling user thread and should not perform network calls. Instead of trusting that all implementations will correctly abide by this contract, it too is protected by a semaphore so that if an implementation is done that involves a network call and becomes latent, the fallback itself won&#8217;t be able to take down the entire app as it will be limited in how many threads it will be able to block.</p>
<p>Despite the use of separate threads with timeouts, we continue to aggressively set timeouts and retries at the network level (through interaction with client library owners, monitoring, audits etc).</p>
<p>The timeouts at the DependencyCommand threading level are the first line of defense regardless of how the underlying dependency client is configured or behaving, but the network timeouts are still important; otherwise, highly latent network calls could fill the dependency thread-pool indefinitely.</p>
<p>The tripping of circuits kicks in when a DependencyCommand has passed a certain threshold of error (such as 50% error rate in a 10 second period) and will then reject all requests until health checks succeed.</p>
<p>This is used primarily to release the pressure on underlying systems (i.e. shed load) when they are having issues and reduce the user request latency by failing fast (or returning a fallback) when we know it is likely to fail instead of making every user request wait for the timeout to occur.</p>
<p><span style="font-size:large;font-weight:bold;">How do we respond to a user request when failure occurs?</span></p>
<p>In each of the options described above a timeout, thread-pool or semaphore rejection, or short-circuit will result in a request not retrieving the optimal response for our customers.</p>
<p>An immediate failure (&#8220;fail fast&#8221;) throws an exception which causes the app to shed load until the dependency returns to health. This is preferable to requests &#8220;piling up&#8221; as it keeps Tomcat request threads available to serve requests from healthy dependencies and enables rapid recovery once failed dependencies recover.</p>
<p>However, there are often several preferable options for providing responses in a &#8220;fallback mode&#8221; to reduce impact of failure on users. Regardless of what causes a failure and how it is intercepted (timeout, rejection, short-circuited etc) the request will always pass through the fallback logic (step 8 in flow chart above) before returning to the user to give a DependencyCommand the opportunity to do something other than &#8220;fail fast&#8221;.</p>
<p>Some approaches to fallbacks we use are, in order of their impact on the user experience:</p>
<ul>
<li><span style="font-size:100%;">Cache: Retrieve data from local or remote caches if the realtime dependency is unavailable, even if the data ends up being stale</span></li>
<li><span style="font-size:100%;">Eventual Consistency: Queue writes (such as in </span><a style="font-size:100%;" href="http://aws.amazon.com/sqs/">SQS</a><span style="font-size:100%;">) to be persisted once the dependency is available again</span></li>
<li><span style="font-size:100%;">Stubbed Data: Revert to default values when personalized options can&#8217;t be retrieved</span></li>
<li><span style="font-size:100%;">Empty Response (&#8220;Fail Silent&#8221;): Return a null or empty list which UIs can then ignore</span></li>
</ul>
<p>All of this work is to maintain maximum uptime for our users while maintaining the maximum number of features for them to enjoy the richest Netflix experience possible. As a result, our goal is to have the fallbacks deliver responses as close as possible to what the actual dependency would deliver.</p>
<p><span style="font-size:large;font-weight:bold;">Example Use Case</span></p>
<p>Following is an example of how threads, network timeouts and retries combine:</p>
<p><a href="http://benjchristensen.files.wordpress.com/2012/03/faulttoleranceexampleconfig.png"><img class="aligncenter size-full wp-image-476" title="" src="http://benjchristensen.files.wordpress.com/2012/03/faulttoleranceexampleconfig.png?w=800" alt="" width="800" height="675" /></a></p>
<p>The above diagram shows an example configuration where the dependency has no reason to hit the 99.5th percentile and thus cuts it short at the network timeout layer and immediately retries, with the expectation of getting median latency most of the time and accomplishing all of this within the 300ms thread timeout.</p>
<p>If the dependency has legitimate reasons to sometimes hit the 99.5th percentile (e.g. a cache miss with lazy generation) then the network timeout will be set higher than it, such as at 325ms with 0 or 1 retries, and the thread timeout set higher (350ms+).</p>
<p>The threadpool is sized at 10 to handle a burst of 99th percentile requests, but when everything is healthy this threadpool will typically only have 1 or 2 threads active at any given time to serve mostly 40ms median calls.</p>
<p>When configured correctly a timeout at the DependencyCommand layer should be rare, but the protection is there in case something other than network latency affects the time, or the combination of connect+read+retry+connect+read in a worst case scenario still exceeds the configured overall timeout.</p>
<p>The aggressiveness of configurations and tradeoffs in each direction are different for each dependency.</p>
<p>Configurations can be changed in realtime as needed, as performance characteristics change or when problems are found, all without risking taking down the entire app if problems or misconfigurations occur.</p>
<p><span style="font-size:large;font-weight:bold;">Conclusion</span></p>
<p>The approaches discussed in this post have had a dramatic effect on our ability to tolerate and be resilient to system, infrastructure and application level failures without impacting (or limiting impact to) user experience.</p>
<p>Despite the success of this new DependencyCommand resiliency system over the past 8 months, there is still a lot for us to do in improving our fault tolerance strategies and performance, especially as we continue to add functionality, devices, customers and international markets.</p>
<br />  <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=benjchristensen.com&#038;blog=859104&#038;post=465&#038;subd=benjchristensen&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://benjchristensen.com/2012/03/01/fault-tolerance-in-a-high-volume-distributed-system/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://2.gravatar.com/avatar/25a69d1e333ff36b77cf01b84b764182?s=96&#38;d=http%3A%2F%2F2.gravatar.com%2Favatar%2Fad516503a11cd5ca435acc9bb6523536%3Fs%3D96&#38;r=G" medium="image">
			<media:title type="html">benjchristensen</media:title>
		</media:content>

		<media:content url="http://benjchristensen.files.wordpress.com/2012/03/dependencies1.png?w=800" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2012/03/dependencies3.png?w=800" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2012/03/faulttolerancetypes.png?w=800" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2012/03/dependencycommands.png?w=800" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2012/03/dependencies4.png?w=800" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2012/03/dependencies6.png?w=800" medium="image" />

		<media:content url="http://benjchristensen.files.wordpress.com/2012/03/faulttoleranceexampleconfig.png?w=800" medium="image" />
	</item>
	</channel>
</rss>
