// lib/Transforms/NaCl/LowerEmAsyncify.cpp - external/github.com/kripken/emscripten-fastcomp - Git at Google

 //===- LowerEmAsyncify - transform asynchronous functions for Emscripten/JS   -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // Lu Wang <coolwanglu@gmail.com>
 //
 // In JS we don't have functions like sleep(), which on the other hand are very popular in C/C++ etc.
 // This pass tries to convert functions calling sleep() into a valid form in JavaScript
 // The basic idea is to split the callee at the place where sleep() is called,
 // then the first half may schedule the second half using setTimeout.
 // But we need to pay lots of attention to analyzing/saving/restoring context variables and return values
 //
 //===----------------------------------------------------------------------===//

 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/Transforms/NaCl.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h" // for DemoteRegToStack, removeUnreachableBlocks
 #include "llvm/Transforms/Utils/PromoteMemToReg.h" // for PromoteMemToReg
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/Pass.h"

 #include <vector>

 #ifdef NDEBUG
 // With NDEBUG the standard assert() compiles to nothing. The checks in this
 // file are kept alive in such builds by re-defining assert() so that a failed
 // condition is reported through report_fatal_error() with the stringified
 // expression as the message.
 //
 // The replacement is a single void expression (as the standard macro is), so
 // `if (c) assert(x); else ...` still parses and the macro can appear wherever
 // an expression is allowed.
 #undef assert
 #define assert(x) ((x) ? static_cast<void>(0) : report_fatal_error(#x))
 #endif

 using namespace llvm;

 // -emscripten-asyncify-functions=<name>[,<name>...]
 // Names of the functions that are async to begin with. runOnModule() looks
 // each name up in the module and uses the functions it finds (unless they are
 // also whitelisted) as the starting points of the caller analysis.
 static cl::list<std::string>
 AsyncifyFunctions("emscripten-asyncify-functions",
                   cl::desc("Functions that call one of these functions, directly or indirectly, will be asyncified"),
                   cl::CommaSeparated);

 // -emscripten-asyncify-whitelist=<name>[,<name>...]
 // Names of functions that runOnModule() skips both when picking the initial
 // async functions and when scanning functions for indirect calls.
 static cl::list<std::string>
 AsyncifyWhiteList("emscripten-asyncify-whitelist",
                   cl::desc("Functions that should not be asyncified"),
                   cl::CommaSeparated);

 namespace {
   class LowerEmAsyncify: public ModulePass {
     Module *TheModule;

   public:
     static char ID; // Pass identification, replacement for typeid
     explicit LowerEmAsyncify() : ModulePass(ID), TheModule(NULL) {
       initializeLowerEmAsyncifyPass(*PassRegistry::getPassRegistry());
     }
     virtual ~LowerEmAsyncify() { }
     bool runOnModule(Module &M);

   private:
     const DataLayout *DL;

     Type *Void, *I1, *I32, *I32Ptr;
     FunctionType *VFunction, *I1Function, *I32PFunction;
     FunctionType *VI32PFunction, *I32PI32Function;
     FunctionType *CallbackFunctionType;

     Function *AllocAsyncCtxFunction, *ReallocAsyncCtxFunction, *FreeAsyncCtxFunction;
     Function *CheckAsyncFunction;
     Function *DoNotUnwindFunction, *DoNotUnwindAsyncFunction;
     Function *GetAsyncReturnValueAddrFunction;

     void initTypesAndFunctions(void);

     typedef std::vector<Instruction *> Instructions;
     typedef DenseMap<Function*, Instructions> FunctionInstructionsMap;
     typedef std::vector<Value*> Values;
     typedef SmallPtrSet<BasicBlock*, 16> BasicBlockSet;

     // all the information we want for an async call
     struct AsyncCallEntry {
       Instruction *AsyncCallInst; // calling an async function
       BasicBlock *AfterCallBlock; // the block we should continue on after getting the return value of AsynCallInst
       CallInst *AllocAsyncCtxInst;  // where we allocate the async ctx before the async call, in the original function
       Values ContextVariables; // those need to be saved and restored for the async call
       StructType *ContextStructType; // The structure constructing all the context variables
       BasicBlock *SaveAsyncCtxBlock; // the block in which we save all the variables
       Function *CallbackFunc; // the callback function for this async call, which is converted from the original function
     };

     BasicBlockSet FindReachableBlocksFrom(BasicBlock *src);

     // Find everything that we should save and restore for the async call
     // save them to Entry.ContextVariables
     void FindContextVariables(AsyncCallEntry & Entry);

     // The essential function
     // F is now in the sync form, transform it into an async form that is valid in JS
     void transformAsyncFunction(Function &F, Instructions const& AsyncCalls);

     bool IsFunctionPointerCall(const Instruction *I);
   };
 }

 // Storage for the pass identification member declared in the class.
 char LowerEmAsyncify::ID = 0;
 // Registers the pass under the argument name "loweremasyncify" and defines the
 // initializeLowerEmAsyncifyPass() function called from the constructor.
 // NOTE(review): the two trailing `false` values are presumably
 // INITIALIZE_PASS's "CFG only" and "is analysis" flags; confirm against
 // PassSupport.h.
 INITIALIZE_PASS(LowerEmAsyncify, "loweremasyncify",
     "Lower async functions for js/emscripten",
     false, false)

 // Entry point of the pass.
 //
 // 1. Seeds a worklist with the functions named by -emscripten-asyncify-functions
 //    that exist in M and are not whitelisted. If there is none, M is left
 //    untouched.
 // 2. Treats every indirect call in a non-whitelisted function as possibly
 //    async: the call is recorded and its function is queued as well.
 // 3. Propagates through the direct callers of every queued function, recording
 //    each call/invoke of an async function under its containing function.
 // 4. Transforms every function for which at least one call was recorded.
 //
 // Returns true if the module was modified.
 bool LowerEmAsyncify::runOnModule(Module &M) {
   TheModule = &M;
   DL = &M.getDataLayout();

   std::set<std::string> WhiteList(AsyncifyWhiteList.begin(), AsyncifyWhiteList.end());

   /*
    * collect all the functions that should be asyncified
    * any function that _might_ call an async function is also async
    */
   std::vector<Function*> AsyncFunctionsPending;
   for (unsigned i = 0; i < AsyncifyFunctions.size(); ++i) {
     std::string const& AFName = AsyncifyFunctions[i];
     Function *F = TheModule->getFunction(AFName);
     if (F && !WhiteList.count(F->getName())) {
       AsyncFunctionsPending.push_back(F);
     }
   }

   // No function needed to transform
   if (AsyncFunctionsPending.empty()) return false;

   // Walk through the call graph and find all the async functions
   FunctionInstructionsMap AsyncFunctionCalls;
   {
     // pessimistic: consider all indirect calls as possibly async
     // TODO: deduce based on function types
     for (Module::iterator FI = TheModule->begin(), FE = TheModule->end(); FI != FE; ++FI) {
       Function *Candidate = &*FI;
       if (WhiteList.count(Candidate->getName())) continue;

       bool has_indirect_call = false;
       for (inst_iterator I = inst_begin(Candidate), E = inst_end(Candidate); I != E; ++I) {
         if (IsFunctionPointerCall(&*I)) {
           has_indirect_call = true;
           AsyncFunctionCalls[Candidate].push_back(&*I);
         }
       }

       if (has_indirect_call) AsyncFunctionsPending.push_back(Candidate);
     }

     // A function may be queued more than once (named twice on the command
     // line, or named there and also containing an indirect call), and a call
     // appears once per operand that uses the callee (e.g. `f(f)`).
     // transformAsyncFunction must see each call exactly once, so remember
     // what has already been expanded/recorded.
     SmallPtrSet<Function*, 32> ExpandedFunctions;
     SmallPtrSet<Instruction*, 32> RecordedCalls;

     while (!AsyncFunctionsPending.empty()) {
       Function *CurFunction = AsyncFunctionsPending.back();
       AsyncFunctionsPending.pop_back();

       if (ExpandedFunctions.count(CurFunction) != 0) continue; // callers already collected
       ExpandedFunctions.insert(CurFunction);

       for (Value::user_iterator UI = CurFunction->user_begin(), E = CurFunction->user_end(); UI != E; ++UI) {
         ImmutableCallSite ICS(*UI);
         if (!ICS) continue;
         // we only need those instructions calling the function
         // if the function address is used for other purpose, we don't care
         if (CurFunction != ICS.getCalledValue()->stripPointerCasts()) continue;
         // Now I is either CallInst or InvokeInst
         Instruction *I = cast<Instruction>(*UI);
         if (RecordedCalls.count(I) != 0) continue; // same call reached through another use
         RecordedCalls.insert(I);

         Function *F = I->getParent()->getParent();
         if (AsyncFunctionCalls.count(F) == 0) {
           // first async call found in F: F itself becomes async
           AsyncFunctionsPending.push_back(F);
         }
         AsyncFunctionCalls[F].push_back(I);
       }
     }
   }

   // exit if no async function is found at all
   if (AsyncFunctionCalls.empty()) return false;

   initTypesAndFunctions();

   for (FunctionInstructionsMap::iterator I = AsyncFunctionCalls.begin(), E = AsyncFunctionCalls.end();
       I != E; ++I) {
     transformAsyncFunction(*(I->first), I->second);
   }

   return true;
 }

 void LowerEmAsyncify::initTypesAndFunctions(void) {
   // Data types
   Void = Type::getVoidTy(TheModule->getContext());
   I1 = Type::getInt1Ty(TheModule->getContext());
   I32 = Type::getInt32Ty(TheModule->getContext());
   I32Ptr = Type::getInt32PtrTy(TheModule->getContext());

   // Function types
   SmallVector<Type*, 2> ArgTypes;
   VFunction = FunctionType::get(Void, false);
   I1Function = FunctionType::get(I1, false);
   I32PFunction = FunctionType::get(I32Ptr, false);

   ArgTypes.clear();
   ArgTypes.push_back(I32Ptr);
   VI32PFunction = FunctionType::get(Void, ArgTypes, false);

   ArgTypes.clear();
   ArgTypes.push_back(I32);
   I32PI32Function = FunctionType::get(I32Ptr, ArgTypes, false);

   CallbackFunctionType = VI32PFunction;

   // Functions
   CheckAsyncFunction = Function::Create(
     I1Function,
     GlobalValue::ExternalLinkage,
     "emscripten_check_async",
     TheModule
   );

   AllocAsyncCtxFunction = Function::Create(
     I32PI32Function,
     GlobalValue::ExternalLinkage,
     "emscripten_alloc_async_context",
     TheModule
   );

   ReallocAsyncCtxFunction = Function::Create(
     I32PI32Function,
     GlobalValue::ExternalLinkage,
     "emscripten_realloc_async_context",
     TheModule
   );

   FreeAsyncCtxFunction = Function::Create(
     VI32PFunction,
     GlobalValue::ExternalLinkage,
     "emscripten_free_async_context",
     TheModule
   );

   DoNotUnwindFunction = Function::Create(
     VFunction,
     GlobalValue::ExternalLinkage,
     "emscripten_do_not_unwind",
     TheModule
   );

   DoNotUnwindAsyncFunction = Function::Create(
     VFunction,
     GlobalValue::ExternalLinkage,
     "emscripten_do_not_unwind_async",
     TheModule
   );

   GetAsyncReturnValueAddrFunction = Function::Create(
     I32PFunction,
     GlobalValue::ExternalLinkage,
     "emscripten_get_async_return_value_addr",
     TheModule
   );
 }

 // Collects `src` and every basic block that can be reached from it by
 // following successor edges (depth-first, using an explicit stack).
 LowerEmAsyncify::BasicBlockSet LowerEmAsyncify::FindReachableBlocksFrom(BasicBlock *src) {
   BasicBlockSet Visited;
   Visited.insert(src);

   std::vector<BasicBlock*> Stack(1, src);
   do {
     BasicBlock *Top = Stack.back();
     Stack.pop_back();

     succ_iterator Succ = succ_begin(Top);
     succ_iterator SuccEnd = succ_end(Top);
     for (; Succ != SuccEnd; ++Succ) {
       BasicBlock *Next = *Succ;
       if (Visited.count(Next) != 0) continue; // already queued or handled
       Visited.insert(Next);
       Stack.push_back(Next);
     }
   } while (!Stack.empty());

   return Visited;
 }

 // Computes the values that must be saved in the async context for the async
 // call described by Entry, and stores them in Entry.ContextVariables
 // (replacing whatever was there).
 //
 // A value has to be saved when code reachable from Entry.AfterCallBlock uses
 // it but it would not be available if execution started directly at
 // AfterCallBlock, which is what the callback function does:
 //  - every function argument used in those blocks
 //  - every instruction used in those blocks that no longer dominates its use
 // The result of the async call itself is excluded; it is obtained from the
 // async return value instead.
 //
 // The function is temporarily modified (a fake entry block is added) and
 // restored before returning.
 void LowerEmAsyncify::FindContextVariables(AsyncCallEntry & Entry) {
   BasicBlock *AfterCallBlock = Entry.AfterCallBlock;

   Function & F = *AfterCallBlock->getParent();

   // Create a new entry block that jumps straight to AfterCallBlock, as if we
   // were in the callback function, then check which variables no longer
   // properly dominate their uses
   BasicBlock *EntryBlock = BasicBlock::Create(TheModule->getContext(), "", &F, &F.getEntryBlock());
   BranchInst::Create(AfterCallBlock, EntryBlock);

   DominatorTreeWrapperPass DTW;
   DTW.runOnFunction(F);
   DominatorTree& DT = DTW.getDomTree();

   // These blocks may be using some values defined at or before AsyncCallBlock
   BasicBlockSet Ramifications = FindReachableBlocksFrom(AfterCallBlock);

   SmallPtrSet<Value*, 256> ContextVariables;

   // Examine the instructions, find all variables that we need to store in the context
   for (BasicBlockSet::iterator RI = Ramifications.begin(), RE = Ramifications.end(); RI != RE; ++RI) {
     for (BasicBlock::iterator I = (*RI)->begin(), E = (*RI)->end(); I != E; ++I) {
       for (unsigned i = 0, NumOperands = I->getNumOperands(); i < NumOperands; ++i) {
         Value *O = I->getOperand(i);
         if (Instruction *Inst = dyn_cast<Instruction>(O)) {
           if (Inst == Entry.AsyncCallInst) continue; // for the original async call, we will load directly from async return value
           if (ContextVariables.count(Inst) != 0) continue; // already examined

           if (!DT.dominates(Inst, I->getOperandUse(i))) {
             // `I` is using `Inst`, yet `Inst` does not dominate `I` if we arrive directly at AfterCallBlock
             // so we need to save `Inst` in the context
             ContextVariables.insert(Inst);
           }
         } else if (Argument *Arg = dyn_cast<Argument>(O)) {
           // count() should be as fast/slow as insert, so just insert here
           ContextVariables.insert(Arg);
         }
       }
     }
   }

   // restore F
   EntryBlock->eraseFromParent();

   // publish the result in the set's iteration order
   Entry.ContextVariables.clear();
   Entry.ContextVariables.reserve(ContextVariables.size());
   for (SmallPtrSet<Value*, 256>::iterator I = ContextVariables.begin(), E = ContextVariables.end(); I != E; ++I) {
     Entry.ContextVariables.push_back(*I);
   }
 }

 /*
  * Consider that F contains a call to G, both of which are async:
  *
  * function F:
  * ...
  * %0 = G(%1, %2, ...);
  * ...
  * return %%;
  *
  * We want to convert F and generate F__async_cb
  * they are similar, but with minor yet important differences
  * Note those `main func only` and `callback func only` instructions

 //////////////////////////////////////////////////////////
   function F:
   ...
   ctx = alloc_ctx(len, sp); // main func only
                          // TODO
                          // we could also do this only after an async call
                          // but in that case we will need to pass ctx to the function
                          // since ctx is no longer in the top async stack frame
   %0 = G(%1, %2, ...);
   if (async) { // G was async
     save context variables in ctx
     register F.async_cb as the callback in frame
     return without unwinding the stack frame
   } else { // G was sync
     // use %0 as normal
     free_ctx(ctx); // main func only
     // ctx is freed here, because so far F is still a sync function
     // and we don't want any side effects
     ...
     async return value = %%;
     return & normally unwind the stack frame // main func only
   }
 //////////////////////////////////////////////////////////

  * And here's F.async_cb

 //////////////////////////////////////////////////////////
   function F.async_cb(ctx):
   load variables from ctx // callback func only
   goto resume_point;      // callback func only
   ...
   ctx = realloc_ctx(len); // callback func only
                           // realloc_ctx is different from alloc_ctx
                           // which reused the current async stack frame
                           // we want to keep the saved stack pointer
   %0 = G(%1, %2, ...);
   if (async) {
     save context variables in ctx
     register F.async_cb as the callback
     return without unwinding the stack frame
   } else {
     resume_point:
     %0'= either %0 or the async return value // callback func only
     ...
     async return value = %%
     return restore the stack pointer back to the value stored in F // callback func only
     // no need to free the ctx
     // the scheduler will be aware of this return and handle the stack frames
   }
 //////////////////////////////////////////////////////////

  */

 // Rewrites F into its async-aware form and generates one callback function per
 // async call.
 //
 // F          - the function to transform; it is modified in place.
 // AsyncCalls - the call/invoke instructions inside F that may be async; must
 //              not be empty.
 //
 // The work is done in passes over AsyncCalls:
 //   Pass 0: remember the return instructions F has before any modification.
 //   Pass 1: around every async call, insert the context allocation, the
 //           "is async?" check, and an (initially empty) SaveAsyncCtx block;
 //           create the empty callback function.
 //   Pass 2: compute the context variables of every call, size the allocation
 //           and fill in the SaveAsyncCtx block.
 //   Pass 3: build every callback by cloning F and redirecting its entry to
 //           the block following the corresponding async call.
 //   Pass 4: changes that only belong to F itself (freeing the context when
 //           the call turned out to be synchronous).
 void LowerEmAsyncify::transformAsyncFunction(Function &F, Instructions const& AsyncCalls) {
   assert(!AsyncCalls.empty());

   // Pass 0
   // collect all the return instructions from the original function
   // will use later
   // (the returns added to the SaveAsyncCtx blocks in Pass 2 are deliberately not part of this list)
   std::vector<ReturnInst*> OrigReturns;
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
     if (ReturnInst *RI = dyn_cast<ReturnInst>(&*I)) {
       OrigReturns.push_back(RI);
     }
   }

   // Pass 1
   // Scan each async call and make the basic structure:
   // All these will be cloned into the callback functions
   // - allocate the async context before calling an async function
   // - check async right after calling an async function, save context & return if async, continue if not
   // - retrieve the async return value and free the async context if the called function turns out to be sync
   std::vector<AsyncCallEntry> AsyncCallEntries;
   AsyncCallEntries.reserve(AsyncCalls.size());
   for (Instructions::const_iterator I = AsyncCalls.begin(), E = AsyncCalls.end(); I != E; ++I) {
     // prepare blocks
     Instruction *CurAsyncCall = *I;

     // The block containing the async call
     BasicBlock *CurBlock = CurAsyncCall->getParent();
     // The block should run after the async call
     BasicBlock *AfterCallBlock = SplitBlock(CurBlock, CurAsyncCall->getNextNode());
     // The block where we store the context and return
     BasicBlock *SaveAsyncCtxBlock = BasicBlock::Create(TheModule->getContext(), "SaveAsyncCtx", &F, AfterCallBlock);
     // add a placeholder terminator (unreachable) to make the block valid; it is replaced in Pass 2
     new UnreachableInst(TheModule->getContext(), SaveAsyncCtxBlock);

     // allocate the context before making the call
     // we don't know the size yet, will fix it later
     // we cannot insert the instruction later because,
     // we need to make sure that all the instructions and blocks are fixed before we can generate DT and find context variables
     // In CallHandler.h `sp` will be put as the second parameter
     // such that we can take a note of the original sp
     CallInst *AllocAsyncCtxInst = CallInst::Create(AllocAsyncCtxFunction, Constant::getNullValue(I32), "AsyncCtx", CurAsyncCall);

     // Right after the call
     // check async and return if so
     // TODO: we can define truly async functions and partial async functions
     {
       // remove old terminator, which came from SplitBlock
       CurBlock->getTerminator()->eraseFromParent();
       // go to SaveAsyncCtxBlock if the previous call is async
       // otherwise just continue to AfterCallBlock
       CallInst *CheckAsync = CallInst::Create(CheckAsyncFunction, "IsAsync", CurBlock);
       BranchInst::Create(SaveAsyncCtxBlock, AfterCallBlock, CheckAsync, CurBlock);
     }

     // take a note of this async call
     // (ContextVariables and ContextStructType are filled in by Pass 2)
     AsyncCallEntry CurAsyncCallEntry;
     CurAsyncCallEntry.AsyncCallInst = CurAsyncCall;
     CurAsyncCallEntry.AfterCallBlock = AfterCallBlock;
     CurAsyncCallEntry.AllocAsyncCtxInst = AllocAsyncCtxInst;
     CurAsyncCallEntry.SaveAsyncCtxBlock = SaveAsyncCtxBlock;
     // create an empty function for the callback, which will be constructed later
     CurAsyncCallEntry.CallbackFunc = Function::Create(CallbackFunctionType, F.getLinkage(), F.getName() + "__async_cb", TheModule);
     AsyncCallEntries.push_back(CurAsyncCallEntry);
   }


   // Pass 2
   // analyze the context variables and construct SaveAsyncCtxBlock for each async call
   // also calculate the size of the context and allocate the async context accordingly
   for (std::vector<AsyncCallEntry>::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end();  EI != EE; ++EI) {
     AsyncCallEntry & CurEntry = *EI;

     // Collect everything to be saved
     FindContextVariables(CurEntry);

     // Pack the variables as a struct
     // layout: { callback function pointer, context variable 0, context variable 1, ... }
     {
       // TODO: sort them from large members to small ones, in order to make the struct compact even when aligned
       SmallVector<Type*, 8> Types;
       Types.push_back(CallbackFunctionType->getPointerTo());
       for (Values::iterator VI = CurEntry.ContextVariables.begin(), VE = CurEntry.ContextVariables.end(); VI != VE; ++VI) {
         Types.push_back((*VI)->getType());
       }
       CurEntry.ContextStructType = StructType::get(TheModule->getContext(), Types);
     }

     // fix the size of allocation
     // (replaces the placeholder zero passed to the alloc call in Pass 1)
     CurEntry.AllocAsyncCtxInst->setOperand(0,
         ConstantInt::get(I32, DL->getTypeStoreSize(CurEntry.ContextStructType)));

     // construct SaveAsyncCtxBlock
     {
       // fill in SaveAsyncCtxBlock
       // temporarily remove the terminator for convenience
       CurEntry.SaveAsyncCtxBlock->getTerminator()->eraseFromParent();
       assert(CurEntry.SaveAsyncCtxBlock->empty());

       Type *AsyncCtxAddrTy = CurEntry.ContextStructType->getPointerTo();
       BitCastInst *AsyncCtxAddr = new BitCastInst(CurEntry.AllocAsyncCtxInst, AsyncCtxAddrTy, "AsyncCtxAddr", CurEntry.SaveAsyncCtxBlock);
       SmallVector<Value*, 2> Indices;
       // store the callback
       {
         Indices.push_back(ConstantInt::get(I32, 0));
         Indices.push_back(ConstantInt::get(I32, 0));
         GetElementPtrInst *AsyncVarAddr = GetElementPtrInst::Create(CurEntry.ContextStructType, AsyncCtxAddr, Indices, "", CurEntry.SaveAsyncCtxBlock);
         new StoreInst(CurEntry.CallbackFunc, AsyncVarAddr, CurEntry.SaveAsyncCtxBlock);
       }
       // store the context variables
       for (size_t i = 0; i < CurEntry.ContextVariables.size(); ++i) {
         Indices.clear();
         Indices.push_back(ConstantInt::get(I32, 0));
         Indices.push_back(ConstantInt::get(I32, i + 1)); // the 0th element is the callback function
         GetElementPtrInst *AsyncVarAddr = GetElementPtrInst::Create(CurEntry.ContextStructType, AsyncCtxAddr, Indices, "", CurEntry.SaveAsyncCtxBlock);
         new StoreInst(CurEntry.ContextVariables[i], AsyncVarAddr, CurEntry.SaveAsyncCtxBlock);
       }
       // to exit the block, we want to return without unwinding the stack frame
       // a non-void F returns a null value of its return type here
       CallInst::Create(DoNotUnwindFunction, "", CurEntry.SaveAsyncCtxBlock);
       ReturnInst::Create(TheModule->getContext(),
           (F.getReturnType()->isVoidTy() ? 0 : Constant::getNullValue(F.getReturnType())),
           CurEntry.SaveAsyncCtxBlock);
     }
   }

   // Pass 3
   // now all the SaveAsyncCtxBlock's have been constructed
   // we can clone F and construct callback functions
   // we could not construct the callbacks in Pass 2 because we need _all_ those SaveAsyncCtxBlock's appear in _each_ callback
   for (std::vector<AsyncCallEntry>::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end();  EI != EE; ++EI) {
     AsyncCallEntry & CurEntry = *EI;

     Function *CurCallbackFunc = CurEntry.CallbackFunc;
     ValueToValueMapTy VMap;

     // Add the entry block
     // load variables from the context
     // also update VMap for CloneFunction
     BasicBlock *EntryBlock = BasicBlock::Create(TheModule->getContext(), "AsyncCallbackEntry", CurCallbackFunc);
     // LoadedAsyncVars[i] is the value loaded for CurEntry.ContextVariables[i]
     std::vector<LoadInst *> LoadedAsyncVars;
     {
       Type *AsyncCtxAddrTy = CurEntry.ContextStructType->getPointerTo();
       BitCastInst *AsyncCtxAddr = new BitCastInst(CurCallbackFunc->arg_begin(), AsyncCtxAddrTy, "AsyncCtx", EntryBlock);
       SmallVector<Value*, 2> Indices;
       for (size_t i = 0; i < CurEntry.ContextVariables.size(); ++i) {
         Indices.clear();
         Indices.push_back(ConstantInt::get(I32, 0));
         Indices.push_back(ConstantInt::get(I32, i + 1)); // the 0th element of AsyncCtx is the callback function
         GetElementPtrInst *AsyncVarAddr = GetElementPtrInst::Create(CurEntry.ContextStructType, AsyncCtxAddr, Indices, "", EntryBlock);
         LoadedAsyncVars.push_back(new LoadInst(AsyncVarAddr, "", EntryBlock));
         // we want the argument to be replaced by the loaded value
         if (isa<Argument>(CurEntry.ContextVariables[i]))
           VMap[CurEntry.ContextVariables[i]] = LoadedAsyncVars.back();
       }
     }

     // we don't need any argument, just leave dummy entries there to cheat CloneFunctionInto
     for (Function::const_arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) {
       if (VMap.count(AI) == 0)
         VMap[AI] = Constant::getNullValue(AI->getType());
     }

     // Clone the function
     {
       SmallVector<ReturnInst*, 8> Returns;
       CloneFunctionInto(CurCallbackFunc, &F, VMap, false, Returns);

       // return type of the callback functions is always void
       // need to fix the return type
       if (!F.getReturnType()->isVoidTy()) {
         // for those return instructions that are from the original function
         // it means we are 'truly' leaving this function
         // need to store the return value right before return
         for (size_t i = 0; i < OrigReturns.size(); ++i) {
           ReturnInst *RI = cast<ReturnInst>(VMap[OrigReturns[i]]);
           // Need to store the return value into the global area
           CallInst *RawRetValAddr = CallInst::Create(GetAsyncReturnValueAddrFunction, "", RI);
           BitCastInst *RetValAddr = new BitCastInst(RawRetValAddr, F.getReturnType()->getPointerTo(), "AsyncRetValAddr", RI);
           new StoreInst(RI->getOperand(0), RetValAddr, RI);
         }
         // we want to unwind the stack back to where it was before the original function as called
         // but we don't actually need to do this here
         // at this point it must be true that no callback is pended
         // so the scheduler will correct the stack pointer and pop the frame
         // here we just fix the return type
         // (this covers every cloned return, including those of the SaveAsyncCtx blocks)
         for (size_t i = 0; i < Returns.size(); ++i) {
           ReplaceInstWithInst(Returns[i], ReturnInst::Create(TheModule->getContext()));
         }
       }
     }

     // the callback function does not have any return value
     // so clear all the attributes for return
     {
       AttributeSet Attrs = CurCallbackFunc->getAttributes();
       CurCallbackFunc->setAttributes(
         Attrs.removeAttributes(TheModule->getContext(), AttributeSet::ReturnIndex, Attrs.getRetAttributes())
       );
     }

     // in the callback function, we never allocate a new async frame
     // instead we reuse the existing one
     // NOTE: this inner loop's EI shadows the outer loop's EI; it visits every async call of F
     for (std::vector<AsyncCallEntry>::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end();  EI != EE; ++EI) {
       Instruction *I = cast<Instruction>(VMap[EI->AllocAsyncCtxInst]);
       ReplaceInstWithInst(I, CallInst::Create(ReallocAsyncCtxFunction, I->getOperand(0), "ReallocAsyncCtx"));
     }

     // mapped entry point & async call
     BasicBlock *ResumeBlock = cast<BasicBlock>(VMap[CurEntry.AfterCallBlock]);
     Instruction *MappedAsyncCall = cast<Instruction>(VMap[CurEntry.AsyncCallInst]);

     // To save space, for each async call in the callback function, we just ignore the sync case, and leave it to the scheduler
     // TODO need an option for this
     {
       // NOTE: EI and CurEntry below shadow the outer ones for the duration of this loop only
       for (std::vector<AsyncCallEntry>::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end();  EI != EE; ++EI) {
         AsyncCallEntry & CurEntry = *EI;
         Instruction *MappedAsyncCallInst = cast<Instruction>(VMap[CurEntry.AsyncCallInst]);
         BasicBlock *MappedAsyncCallBlock = MappedAsyncCallInst->getParent();
         BasicBlock *MappedAfterCallBlock = cast<BasicBlock>(VMap[CurEntry.AfterCallBlock]);

         // for the sync case of the call, go to NewBlock (instead of MappedAfterCallBlock)
         BasicBlock *NewBlock = BasicBlock::Create(TheModule->getContext(), "", CurCallbackFunc, MappedAfterCallBlock);
         MappedAsyncCallBlock->getTerminator()->setSuccessor(1, NewBlock);
         // store the return value
         if (!MappedAsyncCallInst->use_empty()) {
           CallInst *RawRetValAddr = CallInst::Create(GetAsyncReturnValueAddrFunction, "", NewBlock);
           BitCastInst *RetValAddr = new BitCastInst(RawRetValAddr, MappedAsyncCallInst->getType()->getPointerTo(), "AsyncRetValAddr", NewBlock);
           new StoreInst(MappedAsyncCallInst, RetValAddr, NewBlock);
         }
         // tell the scheduler that we want to keep the current async stack frame
         CallInst::Create(DoNotUnwindAsyncFunction, "", NewBlock);
         // finally we go to the SaveAsyncCtxBlock, to register the callback, save the local variables and leave
         BasicBlock *MappedSaveAsyncCtxBlock = cast<BasicBlock>(VMap[CurEntry.SaveAsyncCtxBlock]);
         BranchInst::Create(MappedSaveAsyncCtxBlock, NewBlock);
       }
     }

     // allocas created by DemoteRegToStack below; promoted back to registers at the end
     std::vector<AllocaInst*> ToPromote;
     // applying loaded variables in the entry block
     {
       BasicBlockSet ReachableBlocks = FindReachableBlocksFrom(ResumeBlock);
       for (size_t i = 0; i < CurEntry.ContextVariables.size(); ++i) {
         Value *OrigVar = CurEntry.ContextVariables[i];
         if (isa<Argument>(OrigVar)) continue; // already processed
         Value *CurVar = VMap[OrigVar];
         assert(CurVar != MappedAsyncCall);
         if (Instruction *Inst = dyn_cast<Instruction>(CurVar)) {
           if (ReachableBlocks.count(Inst->getParent())) {
             // Inst could be either defined or loaded from the async context
             // Do the dirty works in memory
             // TODO: might need to check the safety first
             // TODO: can we create phi directly?
             AllocaInst *Addr = DemoteRegToStack(*Inst, false);
             new StoreInst(LoadedAsyncVars[i], Addr, EntryBlock);
             ToPromote.push_back(Addr);
           } else {
             // The parent block is not reachable, which means there is no conflict
             // it's safe to replace Inst with the loaded value
             assert(Inst != LoadedAsyncVars[i]); // this should only happen when OrigVar is an Argument
             Inst->replaceAllUsesWith(LoadedAsyncVars[i]);
           }
         }
       }
     }

     // resolve the return value of the previous async function
     // it could be the value just loaded from the global area
     // or directly returned by the function (in its sync case)
     if (!CurEntry.AsyncCallInst->use_empty()) {
       // load the async return value
       CallInst *RawRetValAddr = CallInst::Create(GetAsyncReturnValueAddrFunction, "", EntryBlock);
       BitCastInst *RetValAddr = new BitCastInst(RawRetValAddr, MappedAsyncCall->getType()->getPointerTo(), "AsyncRetValAddr", EntryBlock);
       LoadInst *RetVal = new LoadInst(RetValAddr, "AsyncRetVal", EntryBlock);

       AllocaInst *Addr = DemoteRegToStack(*MappedAsyncCall, false);
       new StoreInst(RetVal, Addr, EntryBlock);
       ToPromote.push_back(Addr);
     }

     // TODO remove unreachable blocks before creating phi

     // We go right to ResumeBlock from the EntryBlock
     BranchInst::Create(ResumeBlock, EntryBlock);

     /*
      * Creating phi's
      * Normal stack frames and async stack frames are interleaving with each other.
      * In a callback function, if we call an async function, we might need to realloc the async ctx.
      * at this point we don't want anything stored after the ctx,
      * such that we can free and extend the ctx by simply update STACKTOP.
      * Therefore we don't want any alloca's in callback functions.
      *
      */
     if (!ToPromote.empty()) {
       DominatorTreeWrapperPass DTW;
       DTW.runOnFunction(*CurCallbackFunc);
       PromoteMemToReg(ToPromote, DTW.getDomTree());
     }

     removeUnreachableBlocks(*CurCallbackFunc);
   }

   // Pass 4
   // Here are modifications to the original function, which we won't want to be cloned into the callback functions
   for (std::vector<AsyncCallEntry>::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end();  EI != EE; ++EI) {
     AsyncCallEntry & CurEntry = *EI;
     // remove the frame if no async function has been called
     // (AfterCallBlock is only entered when the async check came back false)
     CallInst::Create(FreeAsyncCtxFunction, CurEntry.AllocAsyncCtxInst, "", CurEntry.AfterCallBlock->getFirstNonPHI());
   }
 }

 bool LowerEmAsyncify::IsFunctionPointerCall(const Instruction *I) {
   // mostly from CallHandler.h
   ImmutableCallSite CS(I);
   if (!CS) return false; // not call nor invoke
   const Value *CV = CS.getCalledValue()->stripPointerCasts();
   return !isa<const Function>(CV);
 }

 // Public factory: returns a newly heap-allocated LowerEmAsyncify pass.
 ModulePass *llvm::createLowerEmAsyncifyPass() {
   ModulePass *AsyncifyPass = new LowerEmAsyncify();
   return AsyncifyPass;
 }