Publications

Kevin Lin, Kyle Lo, Joseph E. Gonzalez, and Dan Klein. "Decomposing Complex Queries for Tip-of-the-tongue Retrieval." arXiv, 2023.

When re-finding items, users who forget or are uncertain about identifying details often rely on creative strategies for expressing their information needs -- complex queries that describe content elements (e.g., book characters or events), information beyond the document text (e.g., descriptions of book covers), or personal context (e.g., when they read a book). This retrieval setting, called tip of the tongue (TOT), is especially challenging for models heavily reliant on lexical and semantic overlap between query and document text. In this work, we introduce a simple yet effective framework for handling such complex queries by decomposing the query into individual clues, routing those as sub-queries to specialized retrievers, and ensembling the results. This approach allows us to take advantage of off-the-shelf retrievers (e.g., CLIP for retrieving images of book covers) or incorporate retriever-specific logic (e.g., date constraints). We show that our framework incorporating query decompositions into retrievers can improve gold book recall by up to 7\% relative for Recall\@5 on a new collection of 14,441 real-world query-book pairs from an online community for resolving TOT inquiries.

@misc{lin2023decomposing,
 abstract = {When re-finding items, users who forget or are uncertain about identifying details often rely on creative strategies for expressing their information needs -- complex queries that describe content elements (e.g., book characters or events), information beyond the document text (e.g., descriptions of book covers), or personal context (e.g., when they read a book). This retrieval setting, called tip of the tongue (TOT), is especially challenging for models heavily reliant on lexical and semantic overlap between query and document text. In this work, we introduce a simple yet effective framework for handling such complex queries by decomposing the query into individual clues, routing those as sub-queries to specialized retrievers, and ensembling the results. This approach allows us to take advantage of off-the-shelf retrievers (e.g., CLIP for retrieving images of book covers) or incorporate retriever-specific logic (e.g., date constraints). We show that our framework incorporating query decompositions into retrievers can improve gold book recall by up to 7\% relative for Recall\@5 on a new collection of 14,441 real-world query-book pairs from an online community for resolving TOT inquiries.},
 archiveprefix = {arXiv},
 author = {Kevin Lin and Kyle Lo and Joseph E. Gonzalez and Dan Klein},
 eprint = {2305.15053},
 keywords = {arxivpre},
 primaryclass = {cs.CL},
 title = {Decomposing Complex Queries for Tip-of-the-tongue Retrieval},
 url = {https://arxiv.org/abs/2305.15053},
 year = {2023}
}
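
The framework described above reduces, at a high level, to extracting clues from the query, scoring books with several specialized retrievers, and summing their scores. Below is a minimal sketch under that reading; the clue extractor, the retriever callables, and the weights are hypothetical placeholders, not the authors' implementation.

    # Toy decompose-route-ensemble retrieval. Each retriever is assumed to map a
    # sub-query to a {book_id: score} dict; every component here is a stand-in.
    from collections import defaultdict

    def ensemble_retrieve(query, decompose, retrievers, weights, k=5):
        clues = decompose(query)  # e.g. {"content": "...", "cover": "...", "date": "..."}
        scores = defaultdict(float)
        for clue_type, sub_query in clues.items():
            retriever = retrievers.get(clue_type)
            if retriever is None or not sub_query:
                continue  # no specialized retriever for this clue type
            for book_id, score in retriever(sub_query).items():
                scores[book_id] += weights.get(clue_type, 1.0) * score
        return sorted(scores, key=scores.get, reverse=True)[:k]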

Lisa Dunlap, Alyssa Umino, Han Zhang, Jiezhi Yang, Joseph E. Gonzalez, and Trevor Darrell. "Diversify Your Vision Datasets with Automatic Diffusion-Based Augmentation." arXiv, 2023.

Many fine-grained classification tasks, like rare animal identification, have limited training data and consequently classifiers trained on these datasets often fail to generalize to variations in the domain like changes in weather or location. As such, we explore how natural language descriptions of the domains seen in training data can be used with large vision models trained on diverse pretraining datasets to generate useful variations of the training data. We introduce ALIA (Automated Language-guided Image Augmentation), a method which utilizes large vision and language models to automatically generate natural language descriptions of a dataset's domains and augment the training data via language-guided image editing. To maintain data integrity, a model trained on the original dataset filters out minimal image edits and those which corrupt class-relevant information. The resulting dataset is visually consistent with the original training data and offers significantly enhanced diversity. On fine-grained and cluttered datasets for classification and detection, ALIA surpasses traditional data augmentation and text-to-image generated data by up to 15\%, often even outperforming equivalent additions of real data.

@misc{dunlap2023diversify,
 abstract = {Many fine-grained classification tasks, like rare animal identification, have limited training data and consequently classifiers trained on these datasets often fail to generalize to variations in the domain like changes in weather or location. As such, we explore how natural language descriptions of the domains seen in training data can be used with large vision models trained on diverse pretraining datasets to generate useful variations of the training data. We introduce ALIA (Automated Language-guided Image Augmentation), a method which utilizes large vision and language models to automatically generate natural language descriptions of a dataset's domains and augment the training data via language-guided image editing. To maintain data integrity, a model trained on the original dataset filters out minimal image edits and those which corrupt class-relevant information. The resulting dataset is visually consistent with the original training data and offers significantly enhanced diversity. On fine-grained and cluttered datasets for classification and detection, ALIA surpasses traditional data augmentation and text-to-image generated data by up to 15\%, often even outperforming equivalent additions of real data.},
 archiveprefix = {arXiv},
 author = {Lisa Dunlap and Alyssa Umino and Han Zhang and Jiezhi Yang and Joseph E. Gonzalez and Trevor Darrell},
 code = {https://github.com/lisadunlap/ALIA},
 eprint = {2305.16289},
 keywords = {arxivpre},
 primaryclass = {cs.CV},
 title = {Diversify Your Vision Datasets with Automatic Diffusion-Based Augmentation},
 url = {https://arxiv.org/abs/2305.16289},
 year = {2023}
}
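
As a rough illustration of the ALIA pipeline above, the sketch below generates edits from language descriptions of the training domains and filters them with a classifier trained on the original data. The describe_domains and edit_image callables and the confidence thresholds are assumptions; the paper's prompting, editing model, and filtering rules differ in detail.

    def alia_augment(dataset, describe_domains, edit_image, clf, lo=0.3, hi=0.95):
        prompts = describe_domains(dataset)        # e.g. "a photo of a bird in the rain"
        keep = []
        for image, label in dataset:
            for prompt in prompts:
                edited = edit_image(image, prompt)  # language-guided (diffusion) image edit
                probs = clf(edited)                 # classifier trained on the original data
                pred, conf = max(enumerate(probs), key=lambda t: t[1])
                if pred != label or conf < lo:
                    continue  # class-relevant content likely corrupted by the edit
                if conf > hi:
                    continue  # treated here as a proxy for a minimal, uninformative edit
                keep.append((edited, label))
        return keep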

Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. "Gorilla: Large Language Model Connected with Massive APIs." arXiv, 2023.

Large Language Models (LLMs) have seen an impressive wave of advances recently, with models now excelling in a variety of tasks, such as mathematical reasoning and program synthesis. However, their potential to effectively use tools via API calls remains unfulfilled. This is a challenging task even for today's state-of-the-art LLMs such as GPT-4, largely due to their inability to generate accurate input arguments and their tendency to hallucinate the wrong usage of an API call. We release Gorilla, a finetuned LLaMA-based model that surpasses the performance of GPT-4 on writing API calls. When combined with a document retriever, Gorilla demonstrates a strong capability to adapt to test-time document changes, enabling flexible user updates or version changes. It also substantially mitigates the issue of hallucination, commonly encountered when prompting LLMs directly. To evaluate the model's ability, we introduce APIBench, a comprehensive dataset consisting of HuggingFace, TorchHub, and TensorHub APIs. The successful integration of the retrieval system with Gorilla demonstrates the potential for LLMs to use tools more accurately, keep up with frequently updated documentation, and consequently increase the reliability and applicability of their outputs.

@misc{patil2023gorilla,
 abstract = {Large Language Models (LLMs) have seen an impressive wave of advances recently, with models now excelling in a variety of tasks, such as mathematical reasoning and program synthesis. However, their potential to effectively use tools via API calls remains unfulfilled. This is a challenging task even for today's state-of-the-art LLMs such as GPT-4, largely due to their inability to generate accurate input arguments and their tendency to hallucinate the wrong usage of an API call. We release Gorilla, a finetuned LLaMA-based model that surpasses the performance of GPT-4 on writing API calls. When combined with a document retriever, Gorilla demonstrates a strong capability to adapt to test-time document changes, enabling flexible user updates or version changes. It also substantially mitigates the issue of hallucination, commonly encountered when prompting LLMs directly. To evaluate the model's ability, we introduce APIBench, a comprehensive dataset consisting of HuggingFace, TorchHub, and TensorHub APIs. The successful integration of the retrieval system with Gorilla demonstrates the potential for LLMs to use tools more accurately, keep up with frequently updated documentation, and consequently increase the reliability and applicability of their outputs.},
 archiveprefix = {arXiv},
 author = {Shishir G. Patil and Tianjun Zhang and Xin Wang and Joseph E. Gonzalez},
 code = {https://gorilla.cs.berkeley.edu},
 eprint = {2305.15334},
 keywords = {arxivpre},
 primaryclass = {cs.CL},
 title = {Gorilla: Large Language Model Connected with Massive APIs},
 url = {https://arxiv.org/abs/2305.15334},
 year = {2023}
}
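
A sketch of the retrieval-aware prompting pattern the Gorilla abstract describes: fetch the most relevant API documentation and condition generation on it. retrieve_docs and generate are hypothetical stand-ins for a document retriever and the finetuned model; this is not the released pipeline.

    def generate_api_call(instruction, retrieve_docs, generate, k=1):
        docs = retrieve_docs(instruction, k=k)  # e.g. HuggingFace / TorchHub API docs
        context = "\n\n".join(d["text"] for d in docs)
        prompt = (
            "Use the API documentation below to answer with a single API call.\n\n"
            f"Documentation:\n{context}\n\n"
            f"Task: {instruction}\nAPI call:"
        )
        return generate(prompt)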

Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena." arXiv, 2023.

Evaluating large language model (LLM) based chat assistants is challenging due to their broad capabilities and the inadequacy of existing benchmarks in measuring human preferences. To address this, we explore using strong LLMs as judges to evaluate these models on more open-ended questions. We examine the usage and limitations of LLM-as-a-judge, such as position and verbosity biases and limited reasoning ability, and propose solutions to mitigate some of them. We then verify the agreement between LLM judges and human preferences by introducing two benchmarks: MT-bench, a multi-turn question set; and Chatbot Arena, a crowdsourced battle platform. Our results reveal that strong LLM judges like GPT-4 can match both controlled and crowdsourced human preferences well, achieving over 80\% agreement, the same level of agreement as between humans. Hence, LLM-as-a-judge is a scalable and explainable way to approximate human preferences, which are otherwise very expensive to obtain. Additionally, we show our benchmark and traditional benchmarks complement each other by evaluating several variants of LLaMA/Vicuna. We will publicly release 80 MT-bench questions, 3K expert votes, and 30K conversations with human preferences from Chatbot Arena.

@misc{zheng2023judging,
 abstract = {Evaluating large language model (LLM) based chat assistants is challenging due to their broad capabilities and the inadequacy of existing benchmarks in measuring human preferences. To address this, we explore using strong LLMs as judges to evaluate these models on more open-ended questions. We examine the usage and limitations of LLM-as-a-judge, such as position and verbosity biases and limited reasoning ability, and propose solutions to mitigate some of them. We then verify the agreement between LLM judges and human preferences by introducing two benchmarks: MT-bench, a multi-turn question set; and Chatbot Arena, a crowdsourced battle platform. Our results reveal that strong LLM judges like GPT-4 can match both controlled and crowdsourced human preferences well, achieving over 80\% agreement, the same level of agreement as between humans. Hence, LLM-as-a-judge is a scalable and explainable way to approximate human preferences, which are otherwise very expensive to obtain. Additionally, we show our benchmark and traditional benchmarks complement each other by evaluating several variants of LLaMA/Vicuna. We will publicly release 80 MT-bench questions, 3K expert votes, and 30K conversations with human preferences from Chatbot Arena.},
 archiveprefix = {arXiv},
 author = {Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
 code = {https://github.com/lmsys/fastchat},
 eprint = {2306.05685},
 keywords = {arxivpre},
 primaryclass = {cs.CL},
 title = {Judging LLM-as-a-judge with MT-Bench and Chatbot Arena},
 url = {https://arxiv.org/abs/2306.05685},
 year = {2023}
}
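
One concrete way to implement the pairwise LLM-as-a-judge protocol while guarding against the position bias noted above is to judge both answer orders and only accept a verdict that is stable under the swap. The judge callable is a hypothetical wrapper around an LLM returning "A", "B", or "tie"; MT-Bench's actual prompts and tie handling differ.

    def pairwise_verdict(question, answer_a, answer_b, judge):
        first = judge(question, answer_a, answer_b)          # answers in original order
        second = judge(question, answer_b, answer_a)         # answers swapped
        swapped = {"A": "B", "B": "A"}.get(second, "tie")    # map back to original labels
        if first == swapped:
            return first   # verdict is stable under the position swap
        return "tie"       # inconsistent verdicts are treated as a tie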

Yonghao Zhuang, Lianmin Zheng, Zhuohan Li, Eric Xing, Qirong Ho, Joseph E. Gonzalez, Ion Stoica, Hao Zhang, and Hexu Zhao. "On Optimizing the Communication of Model Parallelism." Proceedings of Machine Learning and Systems, 2023.

We study a novel and important communication pattern in large-scale model-parallel deep learning (DL), which we call cross-mesh resharding. This pattern emerges when the two paradigms of model parallelism – intra-operator and inter-operator parallelism – are combined to support large models on large clusters. In cross-mesh resharding, a sharded tensor needs to be sent from a source device mesh to a destination device mesh, on which the tensor may be distributed with the same or different layouts. We formalize this as a many-to-many multicast communication problem, and show that existing approaches either are sub-optimal or do not generalize to different network topologies or tensor layouts, which result from different model architectures and parallelism strategies. We then propose two contributions to address cross-mesh resharding: an efficient broadcast-based communication system, and an "overlapping-friendly" pipeline schedule. On microbenchmarks, our overall system outperforms existing ones by up to 10x across various tensor and mesh layouts. On end-to-end training of two large models, GPT-3 and U-Transformer, we improve throughput by 10\% and 50\%, respectively.

@inproceedings{Zhuang23,
 abstract = {We study a novel and important communication pattern in large-scale model-parallel deep learning (DL), which we call cross-mesh resharding. This pattern emerges when the two paradigms of model parallelism – intra-operator and inter-operator parallelism – are combined to support large models on large clusters. In cross-mesh resharding, a sharded tensor needs to be sent from a source device mesh to a destination device mesh, on which the tensor may be distributed with the same or different layouts. We formalize this as a many-to-many multicast communication problem, and show that existing approaches either are sub-optimal or do not generalize to different network topologies or tensor layouts, which result from different model architectures and parallelism strategies. We then propose two contributions to address cross-mesh resharding: an efficient broadcast-based communication system, and an "overlapping-friendly" pipeline schedule. On microbenchmarks, our overall system outperforms existing ones by up to 10x across various tensor and mesh layouts. On end-to-end training of two large models, GPT-3 and U-Transformer, we improve throughput by 10\% and 50\%, respectively.},
 author = {Zhuang, Yonghao and Zheng, Lianmin and Li, Zhuohan and Xing, Eric and Ho, Qirong and Gonzalez, Joseph E. and Stoica, Ion and Zhang, Hao and Zhao, Hexu},
 booktitle = {Proceedings of Machine Learning and Systems},
 keywords = {peerrev, selected},
 title = {On Optimizing the Communication of Model Parallelism},
 url = {https://proceedings.mlsys.org/paper_files/paper/2023/hash/d0b9a3081f811b2a307c38ad457a487c-Abstract-mlsys2023.html},
 year = {2023}
}

Suzanne Petryk, Spencer Whitehead, Joseph E. Gonzalez, Trevor Darrell, Anna Rohrbach, and Marcus Rohrbach. "Simple Token-Level Confidence Improves Caption Correctness." arXiv, 2023.

The ability to judge whether a caption correctly describes an image is a critical part of vision-language understanding. However, state-of-the-art models often misinterpret the correctness of fine-grained details, leading to errors in outputs such as hallucinating objects in generated captions or poor compositional reasoning. In this work, we explore Token-Level Confidence, or TLC, as a simple yet surprisingly effective method to assess caption correctness. Specifically, we fine-tune a vision-language model on image captioning, input an image and proposed caption to the model, and aggregate either algebraic or learned token confidences over words or sequences to estimate image-caption consistency. Compared to sequence-level scores from pretrained models, TLC with algebraic confidence measures achieves a relative improvement in accuracy by 10\% on verb understanding in SVO-Probes and outperforms prior state-of-the-art in image and group scores for compositional reasoning in Winoground by a relative 37\% and 9\%, respectively. When training data are available, a learned confidence estimator provides further improved performance, reducing object hallucination rates in MS COCO Captions by a relative 30\% over the original model and setting a new state-of-the-art.

@misc{petryk2023simple,
 abstract = {The ability to judge whether a caption correctly describes an image is a critical part of vision-language understanding. However, state-of-the-art models often misinterpret the correctness of fine-grained details, leading to errors in outputs such as hallucinating objects in generated captions or poor compositional reasoning. In this work, we explore Token-Level Confidence, or TLC, as a simple yet surprisingly effective method to assess caption correctness. Specifically, we fine-tune a vision-language model on image captioning, input an image and proposed caption to the model, and aggregate either algebraic or learned token confidences over words or sequences to estimate image-caption consistency. Compared to sequence-level scores from pretrained models, TLC with algebraic confidence measures achieves a relative improvement in accuracy by 10\% on verb understanding in SVO-Probes and outperforms prior state-of-the-art in image and group scores for compositional reasoning in Winoground by a relative 37\% and 9\%, respectively. When training data are available, a learned confidence estimator provides further improved performance, reducing object hallucination rates in MS COCO Captions by a relative 30\% over the original model and setting a new state-of-the-art.},
 archiveprefix = {arXiv},
 author = {Suzanne Petryk and Spencer Whitehead and Joseph E. Gonzalez and Trevor Darrell and Anna Rohrbach and Marcus Rohrbach},
 eprint = {2305.07021},
 keywords = {arxivpre},
 primaryclass = {cs.CV},
 title = {Simple Token-Level Confidence Improves Caption Correctness},
 url = {https://arxiv.org/abs/2305.07021},
 year = {2023}
}
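
A minimal sketch of Token-Level Confidence with an algebraic aggregation: teacher-force the proposed caption through a captioning model and reduce the per-token probabilities with min or mean. The model interface is assumed, and the learned confidence estimator from the paper is omitted.

    import torch

    @torch.no_grad()
    def caption_confidence(model, image, caption_ids, reduce="min"):
        # model(image, caption_ids) is assumed to return logits of shape
        # [seq_len, vocab_size] aligned with the teacher-forced caption tokens.
        logits = model(image, caption_ids)
        probs = torch.softmax(logits, dim=-1)
        token_conf = probs.gather(-1, caption_ids.unsqueeze(-1)).squeeze(-1)
        return token_conf.min() if reduce == "min" else token_conf.mean()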

Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P. Xing, Joseph E. Gonzalez, and Ion Stoica. "Alpa: Automating Inter- and Intra-Operator Parallelism for Distributed Deep Learning." 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), 2022.

Alpa automates model-parallel training of large deep learning (DL) models by generating execution plans that unify data, operator, and pipeline parallelism. Existing model-parallel training systems either require users to manually create a parallelization plan or automatically generate one from a limited space of model parallelism configurations. They do not suffice to scale out complex DL models on distributed compute devices. Alpa distributes the training of large DL models by viewing parallelisms as two hierarchical levels: inter-operator and intra-operator parallelisms. Based on it, Alpa constructs a new hierarchical space for massive model-parallel execution plans. Alpa designs a number of compilation passes to automatically derive efficient parallel execution plans at each parallelism level. Alpa implements an efficient runtime to orchestrate the two-level parallel execution on distributed compute devices. Our evaluation shows Alpa generates parallelization plans that match or outperform hand-tuned model-parallel training systems even on models they are designed for. Unlike specialized systems, Alpa also generalizes to models with heterogeneous architectures and models without manually-designed plans. Alpa's source code is publicly available at \url{https://github.com/alpa-projects/alpa}

@inproceedings{alpa22,
 abstract = {Alpa automates model-parallel training of large deep learning (DL) models by generating execution plans that unify data, operator, and pipeline parallelism. Existing model-parallel training systems either require users to manually create a parallelization plan or automatically generate one from a limited space of model parallelism configurations. They do not suffice to scale out complex DL models on distributed compute devices. Alpa distributes the training of large DL models by viewing parallelisms as two hierarchical levels: inter-operator and intra-operator parallelisms. Based on it, Alpa constructs a new hierarchical space for massive model-parallel execution plans. Alpa designs a number of compilation passes to automatically derive efficient parallel execution plans at each parallelism level. Alpa implements an efficient runtime to orchestrate the two-level parallel execution on distributed compute devices. Our evaluation shows Alpa generates parallelization plans that match or outperform hand-tuned model-parallel training systems even on models they are designed for. Unlike specialized systems, Alpa also generalizes to models with heterogeneous architectures and models without manually-designed plans. Alpa's source code is publicly available at \url{https://github.com/alpa-projects/alpa} },
 address = {Carlsbad, CA},
 author = {Lianmin Zheng and Zhuohan Li and Hao Zhang and Yonghao Zhuang and Zhifeng Chen and Yanping Huang and Yida Wang and Yuanzhong Xu and Danyang Zhuo and Eric P. Xing and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {https://www.usenix.org/conference/osdi22/presentation/zheng-lianmin},
 booktitle = {16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)},
 code = {https://github.com/alpa-projects/alpa},
 isbn = {978-1-939133-28-1},
 keywords = {peerrev, selected},
 month = {7},
 pages = {559--578},
 publisher = {USENIX Association},
 title = {Alpa: Automating Inter- and {Intra-Operator} Parallelism for Distributed Deep Learning},
 url = {https://www.usenix.org/conference/osdi22/presentation/zheng-lianmin},
 year = {2022}
}

Xiaoxuan Liu, Lianmin Zheng, Dequan Wang, Yukuo Cen, Weize Chen, Xu Han, Jianfei Chen, Zhiyuan Liu, Jie Tang, Joseph E. Gonzalez, Michael Mahoney, and Alvin Cheung. "GACT: Activation Compressed Training for Generic Network Architectures." Proceedings of the 39th International Conference on Machine Learning, 2022.

Training large neural network (NN) models requires extensive memory resources, and Activation Compression Training (ACT) is a promising approach to reduce training memory footprint. This paper presents GACT, an ACT framework to support a broad range of machine learning tasks for generic NN architectures with limited domain knowledge. By analyzing a linearized version of ACT's approximate gradient, we prove the convergence of GACT without prior knowledge on operator type or model architecture. To make training stable, we propose an algorithm that decides the compression ratio for each tensor by estimating its impact on the gradient at run time. We implement GACT as a PyTorch library that readily applies to any NN architecture. GACT reduces the activation memory for convolutional NNs, transformers, and graph NNs by up to 8.1x, enabling training with a 4.2x to 24.7x larger batch size, with negligible accuracy loss.

@inproceedings{gact22,
 abstract = {Training large neural network (NN) models requires extensive memory resources, and Activation Compression Training (ACT) is a promising approach to reduce training memory footprint. This paper presents GACT, an ACT framework to support a broad range of machine learning tasks for generic NN architectures with limited domain knowledge. By analyzing a linearized version of ACT's approximate gradient, we prove the convergence of GACT without prior knowledge on operator type or model architecture. To make training stable, we propose an algorithm that decides the compression ratio for each tensor by estimating its impact on the gradient at run time. We implement GACT as a PyTorch library that readily applies to any NN architecture. GACT reduces the activation memory for convolutional NNs, transformers, and graph NNs by up to 8.1x, enabling training with a 4.2x to 24.7x larger batch size, with negligible accuracy loss.},
 author = {Liu, Xiaoxuan and Zheng, Lianmin and Wang, Dequan and Cen, Yukuo and Chen, Weize and Han, Xu and Chen, Jianfei and Liu, Zhiyuan and Tang, Jie and Gonzalez, Joseph E. and Mahoney, Michael and Cheung, Alvin},
 bdsk-url-1 = {https://proceedings.mlr.press/v162/liu22v.html},
 booktitle = {Proceedings of the 39th International Conference on Machine Learning},
 editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
 keywords = {peerrev, selected},
 month = {7},
 pages = {14139--14152},
 pdf = {https://proceedings.mlr.press/v162/liu22v/liu22v.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = { {GACT}: Activation Compressed Training for Generic Network Architectures},
 url = {https://proceedings.mlr.press/v162/liu22v.html},
 volume = {162},
 year = {2022}
}
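
The sketch below shows the core activation-compression mechanism in a toy form: an autograd function that stores an 8-bit quantized activation in the forward pass and dequantizes it in the backward pass. GACT's adaptive per-tensor compression ratios and convergence analysis are not reproduced here.

    import torch

    class CompressedReLU(torch.autograd.Function):
        """ReLU that saves its activation as int8 instead of float for backward."""

        @staticmethod
        def forward(ctx, x):
            y = torch.relu(x)
            scale = y.abs().amax().clamp(min=1e-8) / 127.0
            q = (y / scale).round().to(torch.int8)   # 8-bit compressed activation
            ctx.save_for_backward(q, scale)
            return y

        @staticmethod
        def backward(ctx, grad_out):
            q, scale = ctx.saved_tensors
            y = q.to(grad_out.dtype) * scale              # dequantize the saved activation
            return grad_out * (y > 0).to(grad_out.dtype)  # ReLU gradient mask

Usage inside a module's forward pass is simply y = CompressedReLU.apply(x).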

Tianjun Zhang, Tongzheng Ren, Mengjiao Yang, Joseph Gonzalez, Dale Schuurmans, and Bo Dai. "Making Linear MDPs Practical via Contrastive Representation Learning." Proceedings of the 39th International Conference on Machine Learning, 2022.

It is common to address the curse of dimensionality in Markov decision processes (MDPs) by exploiting low-rank representations. This motivates much of the recent theoretical study on linear MDPs. However, most approaches require a given representation under unrealistic assumptions about the normalization of the decomposition or introduce unresolved computational challenges in practice. Instead, we consider an alternative definition of linear MDPs that automatically ensures normalization while allowing efficient representation learning via contrastive estimation. The framework also admits confidence-adjusted index algorithms, enabling an efficient and principled approach to incorporating optimism or pessimism in the face of uncertainty. To the best of our knowledge, this provides the first practical representation learning method for linear MDPs that achieves both strong theoretical guarantees and empirical performance. Theoretically, we prove that the proposed algorithm is sample efficient in both the online and offline settings. Empirically, we demonstrate superior performance over existing state-of-the-art model-based and model-free algorithms on several benchmarks.

@inproceedings{zhang2022contrastive,
 abstract = {It is common to address the curse of dimensionality in Markov decision processes (MDPs) by exploiting low-rank representations. This motivates much of the recent theoretical study on linear MDPs. However, most approaches require a given representation under unrealistic assumptions about the normalization of the decomposition or introduce unresolved computational challenges in practice. Instead, we consider an alternative definition of linear MDPs that automatically ensures normalization while allowing efficient representation learning via contrastive estimation. The framework also admits confidence-adjusted index algorithms, enabling an efficient and principled approach to incorporating optimism or pessimism in the face of uncertainty. To the best of our knowledge, this provides the first practical representation learning method for linear MDPs that achieves both strong theoretical guarantees and empirical performance. Theoretically, we prove that the proposed algorithm is sample efficient in both the online and offline settings. Empirically, we demonstrate superior performance over existing state-of-the-art model-based and model-free algorithms on several benchmarks.},
 author = {Zhang, Tianjun and Ren, Tongzheng and Yang, Mengjiao and Gonzalez, Joseph and Schuurmans, Dale and Dai, Bo},
 bdsk-url-1 = {https://proceedings.mlr.press/v162/zhang22x.html},
 booktitle = {Proceedings of the 39th International Conference on Machine Learning},
 editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
 keywords = {peerrev, selected},
 month = {7},
 pages = {26447--26466},
 pdf = {https://proceedings.mlr.press/v162/zhang22x/zhang22x.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = {Making Linear {MDP}s Practical via Contrastive Representation Learning},
 url = {https://proceedings.mlr.press/v162/zhang22x.html},
 volume = {162},
 year = {2022}
}
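
For reference, the standard low-rank (linear) MDP factorization that this line of work builds on can be written as

    P(s' \mid s, a) = \langle \phi(s, a), \mu(s') \rangle, \qquad
    r(s, a) = \langle \phi(s, a), \theta \rangle,

where \phi(s, a) is a d-dimensional feature map and \mu, \theta are unknown parameters. The paper adopts an alternative definition that builds normalization into this factorization and learns \phi via contrastive estimation; the display above is only the conventional form, not the paper's exact construction.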

Zhengming Zhang, Ashwinee Panda, Linyue Song, Yaoqing Yang, Michael Mahoney, Prateek Mittal, Kannan Ramchandran, and Joseph Gonzalez. "Neurotoxin: Durable Backdoors in Federated Learning." Proceedings of the 39th International Conference on Machine Learning, 2022.

Federated learning (FL) systems have an inherent vulnerability to adversarial backdoor attacks during training due to their decentralized nature. The goal of the attacker is to implant backdoors in the learned model with poisoned updates such that at test time, the model's outputs can be fixed to a given target for certain inputs (e.g., if a user types ``people from New York'' into a mobile keyboard app that uses a backdoored next word prediction model, the model will autocomplete their sentence to ``people in New York are rude''). Prior work has shown that backdoors can be inserted in FL, but these backdoors are not durable: they do not remain in the model after the attacker stops uploading poisoned updates because training continues, and in production FL systems an inserted backdoor may not survive until deployment. We propose Neurotoxin, a simple one-line backdoor attack that functions by attacking parameters that are changed less in magnitude during training. We conduct an exhaustive evaluation across ten natural language processing and computer vision tasks and find that we can double the durability of state of the art backdoors by adding a single line with Neurotoxin.

@inproceedings{zhang22w,
 abstract = {Federated learning (FL) systems have an inherent vulnerability to adversarial backdoor attacks during training due to their decentralized nature. The goal of the attacker is to implant backdoors in the learned model with poisoned updates such that at test time, the model's outputs can be fixed to a given target for certain inputs (e.g., if a user types ``people from New York'' into a mobile keyboard app that uses a backdoored next word prediction model, the model will autocomplete their sentence to ``people in New York are rude''). Prior work has shown that backdoors can be inserted in FL, but these backdoors are not durable: they do not remain in the model after the attacker stops uploading poisoned updates because training continues, and in production FL systems an inserted backdoor may not survive until deployment. We propose Neurotoxin, a simple one-line backdoor attack that functions by attacking parameters that are changed less in magnitude during training. We conduct an exhaustive evaluation across ten natural language processing and computer vision tasks and find that we can double the durability of state of the art backdoors by adding a single line with Neurotoxin.},
 author = {Zhang, Zhengming and Panda, Ashwinee and Song, Linyue and Yang, Yaoqing and Mahoney, Michael and Mittal, Prateek and Ramchandran, Kannan and Gonzalez, Joseph},
 bdsk-url-1 = {https://proceedings.mlr.press/v162/zhang22w.html},
 booktitle = {Proceedings of the 39th International Conference on Machine Learning},
 editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
 keywords = {peerrev, selected},
 month = {7},
 pages = {26429--26446},
 pdf = {https://proceedings.mlr.press/v162/zhang22w/zhang22w.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = {Neurotoxin: Durable Backdoors in Federated Learning},
 url = {https://proceedings.mlr.press/v162/zhang22w.html},
 volume = {162},
 year = {2022}
}
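
The "single line" the abstract refers to amounts to constraining the attacker's update to coordinates that the benign updates barely touch. A hedged sketch of that projection step is below; the top-k ratio and the single flattened tensor are simplifying assumptions.

    import torch

    def neurotoxin_project(malicious_update, benign_update, top_ratio=0.01):
        # Zero the attack update on the coordinates where the observed benign
        # (global) update has the largest magnitude, so the backdoor lives in
        # rarely-updated parameters.
        k = max(1, int(top_ratio * benign_update.numel()))
        top_idx = benign_update.abs().flatten().topk(k).indices
        projected = malicious_update.flatten().clone()
        projected[top_idx] = 0.0   # avoid the heavily-updated coordinates
        return projected.view_as(malicious_update)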

Shishir G. Patil, Paras Jain, Prabal Dutta, Ion Stoica, and Joseph Gonzalez. "POET: Training Neural Networks on Tiny Devices with Integrated Rematerialization and Paging." Proceedings of the 39th International Conference on Machine Learning, 2022.

Fine-tuning models on edge devices like mobile phones would enable privacy-preserving personalization over sensitive data. However, edge training has historically been limited to relatively small models with simple architectures because training is both memory and energy intensive. We present POET, an algorithm to enable training large neural networks on memory-scarce battery-operated edge devices. POET jointly optimizes the integrated search spaces of rematerialization and paging, two algorithms to reduce the memory consumption of backpropagation. Given a memory budget and a run-time constraint, we formulate a mixed-integer linear program (MILP) for energy-optimal training. Our approach enables training significantly larger models on embedded devices while reducing energy consumption, without modifying the mathematical correctness of backpropagation. We demonstrate that it is possible to fine-tune both ResNet-18 and BERT within the memory constraints of a Cortex-M class embedded device while outperforming current edge training methods in energy efficiency. POET is an open-source project available at https://github.com/ShishirPatil/poet

@inproceedings{poet22,
 abstract = {Fine-tuning models on edge devices like mobile phones would enable privacy-preserving personalization over sensitive data. However, edge training has historically been limited to relatively small models with simple architectures because training is both memory and energy intensive. We present POET, an algorithm to enable training large neural networks on memory-scarce battery-operated edge devices. POET jointly optimizes the integrated search spaces of rematerialization and paging, two algorithms to reduce the memory consumption of backpropagation. Given a memory budget and a run-time constraint, we formulate a mixed-integer linear program (MILP) for energy-optimal training. Our approach enables training significantly larger models on embedded devices while reducing energy consumption, without modifying the mathematical correctness of backpropagation. We demonstrate that it is possible to fine-tune both ResNet-18 and BERT within the memory constraints of a Cortex-M class embedded device while outperforming current edge training methods in energy efficiency. POET is an open-source project available at https://github.com/ShishirPatil/poet},
 author = {Patil, Shishir G. and Jain, Paras and Dutta, Prabal and Stoica, Ion and Gonzalez, Joseph},
 bdsk-url-1 = {https://proceedings.mlr.press/v162/patil22b.html},
 booktitle = {Proceedings of the 39th International Conference on Machine Learning},
 code = {https://github.com/shishirpatil/poet},
 editor = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
 keywords = {peerrev, selected},
 month = {7},
 pages = {17573--17583},
 pdf = {https://proceedings.mlr.press/v162/patil22b/patil22b.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = { {POET}: Training Neural Networks on Tiny Devices with Integrated Rematerialization and Paging},
 url = {https://proceedings.mlr.press/v162/patil22b.html},
 volume = {162},
 year = {2022}
}
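
To make the MILP idea concrete, here is a heavily simplified per-layer toy in PuLP: each activation is kept in RAM, rematerialized, or paged out, and added energy is minimized under a peak-memory budget. The numbers and the per-layer (rather than per-timestep) formulation are invented for illustration and are much coarser than POET's integrated schedule.

    import pulp

    mem    = [40, 80, 120, 60]   # KB of activation memory per layer (made-up)
    e_rem  = [5, 9, 20, 7]       # energy to recompute the activation (made-up)
    e_page = [3, 6, 10, 4]       # energy to page it out and back in (made-up)
    budget = 150                 # KB of RAM available for saved activations
    n = len(mem)

    prob  = pulp.LpProblem("poet_toy", pulp.LpMinimize)
    keep  = [pulp.LpVariable(f"keep_{i}", cat="Binary") for i in range(n)]
    remat = [pulp.LpVariable(f"remat_{i}", cat="Binary") for i in range(n)]
    page  = [pulp.LpVariable(f"page_{i}", cat="Binary") for i in range(n)]

    prob += pulp.lpSum(e_rem[i] * remat[i] + e_page[i] * page[i] for i in range(n))  # energy objective
    for i in range(n):
        prob += keep[i] + remat[i] + page[i] == 1        # pick exactly one strategy per layer
    prob += pulp.lpSum(mem[i] * keep[i] for i in range(n)) <= budget  # peak-memory budget

    prob.solve(pulp.PULP_CBC_CMD(msg=False))
    plan = ["keep" if keep[i].value() else ("remat" if remat[i].value() else "page") for i in range(n)]
    print(plan, "energy =", pulp.value(prob.objective))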

Suzanne Petryk, Lisa Dunlap, Keyan Nasseri, Joseph Gonzalez, Trevor Darrell, and Anna Rohrbach. "On Guiding Visual Attention With Language Specification." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2022.

While real world challenges typically define visual categories with language words or phrases, most visual classification methods define categories with numerical indices. However, the language specification of the classes provides an especially useful prior for biased and noisy datasets, where it can help disambiguate what features are task-relevant. Recently, large-scale multimodal models have been shown to recognize a wide variety of high-level concepts from a language specification even without additional image training data, but they are often unable to distinguish classes for more fine-grained tasks. CNNs, in contrast, can extract subtle image features that are required for fine-grained discrimination, but will overfit to any bias or noise in datasets. Our insight is to use high-level language specification as advice for constraining the prediction evidence to task-relevant features, instead of distractors. To do this, we ground task-relevant words or phrases with attention maps from a pretrained large-scale model. We then use this grounding to supervise a classifier's spatial attention away from distracting context. We show that supervising spatial attention in this way improves performance on classification tasks with biased and noisy data, including 3-15\% worst-group accuracy improvements and 41-45\% relative improvements on fairness metrics.

@inproceedings{Petryk_2022_CVPR,
 abstract = {While real world challenges typically define visual categories with language words or phrases, most visual classification methods define categories with numerical indices. However, the language specification of the classes provides an especially useful prior for biased and noisy datasets, where it can help disambiguate what features are task-relevant. Recently, large-scale multimodal models have been shown to recognize a wide variety of high-level concepts from a language specification even without additional image training data, but they are often unable to distinguish classes for more fine-grained tasks. CNNs, in contrast, can extract subtle image features that are required for fine-grained discrimination, but will overfit to any bias or noise in datasets. Our insight is to use high-level language specification as advice for constraining the prediction evidence to task-relevant features, instead of distractors. To do this, we ground task-relevant words or phrases with attention maps from a pretrained large-scale model. We then use this grounding to supervise a classifier's spatial attention away from distracting context. We show that supervising spatial attention in this way improves performance on classification tasks with biased and noisy data, including 3-15\% worst-group accuracy improvements and 41-45\% relative improvements on fairness metrics.},
 author = {Petryk, Suzanne and Dunlap, Lisa and Nasseri, Keyan and Gonzalez, Joseph and Darrell, Trevor and Rohrbach, Anna},
 booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
 keywords = {peerrev, selected},
 month = {6},
 pages = {18092-18102},
 title = {On Guiding Visual Attention With Language Specification},
 year = {2022}
}
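
A rough sketch of the supervision signal described above: penalize the mass of the classifier's spatial attention that falls outside a relevance map grounded from task-relevant words by a pretrained vision-language model. Both attention maps are assumed inputs; the paper's grounding procedure and loss form differ.

    import torch

    def attention_guidance_loss(clf_attn, lang_attn, eps=1e-8):
        # clf_attn, lang_attn: [B, H, W] non-negative spatial attention maps.
        clf_attn = clf_attn / (clf_attn.sum(dim=(1, 2), keepdim=True) + eps)
        lang_attn = lang_attn / (lang_attn.amax(dim=(1, 2), keepdim=True) + eps)
        # Mass the classifier puts on regions the language grounding marks irrelevant.
        return (clf_attn * (1.0 - lang_attn)).sum(dim=(1, 2)).mean()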

Wenshuo Guo, Kirthevasan Kandasamy, Joseph Gonzalez, Michael Jordan, and Ion Stoica. "Learning Competitive Equilibria in Exchange Economies with Bandit Feedback." Proceedings of The 25th International Conference on Artificial Intelligence and Statistics, 2022.

The sharing of scarce resources among multiple rational agents is one of the classical problems in economics. In exchange economies, which are used to model such situations, agents begin with an initial endowment of resources and exchange them in a way that is mutually beneficial until they reach a competitive equilibrium (CE). The allocations at a CE are Pareto efficient and fair. Consequently, they are used widely in designing mechanisms for fair division. However, computing CEs requires the knowledge of agent preferences which are unknown in several applications of interest. In this work, we explore a new online learning mechanism, which, on each round, allocates resources to the agents and collects stochastic feedback on their experience in using that allocation. Its goal is to learn the agent utilities via this feedback and imitate the allocations at a CE in the long run. We quantify CE behavior via two losses and propose a randomized algorithm which achieves sublinear loss under a parametric class of utilities. Empirically, we demonstrate the effectiveness of this mechanism through numerical simulations.

@inproceedings{pmlrv151guo22a,
 abstract = { The sharing of scarce resources among multiple rational agents is one of the classical problems in economics. In exchange economies, which are used to model such situations, agents begin with an initial endowment of resources and exchange them in a way that is mutually beneficial until they reach a competitive equilibrium (CE). The allocations at a CE are Pareto efficient and fair. Consequently, they are used widely in designing mechanisms for fair division. However, computing CEs requires the knowledge of agent preferences which are unknown in several applications of interest. In this work, we explore a new online learning mechanism, which, on each round, allocates resources to the agents and collects stochastic feedback on their experience in using that allocation. Its goal is to learn the agent utilities via this feedback and imitate the allocations at a CE in the long run. We quantify CE behavior via two losses and propose a randomized algorithm which achieves sublinear loss under a parametric class of utilities. Empirically, we demonstrate the effectiveness of this mechanism through numerical simulations. },
 author = {Guo, Wenshuo and Kandasamy, Kirthevasan and Gonzalez, Joseph and Jordan, Michael and Stoica, Ion},
 bdsk-url-1 = {https://proceedings.mlr.press/v151/guo22a.html},
 booktitle = {Proceedings of The 25th International Conference on Artificial Intelligence and Statistics},
 editor = {Camps-Valls, Gustau and Ruiz, Francisco J. R. and Valera, Isabel},
 keywords = {peerrev, selected},
 month = {3},
 pages = {6200--6224},
 pdf = {https://proceedings.mlr.press/v151/guo22a/guo22a.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = {Learning Competitive Equilibria in Exchange Economies with Bandit Feedback},
 url = {https://proceedings.mlr.press/v151/guo22a.html},
 volume = {151},
 year = {2022}
}

Albert Wilcox, Ashwin Balakrishna, Brijen Thananjeyan, Joseph E. Gonzalez, and Ken Goldberg. "LS3: Latent Space Safe Sets for Long-Horizon Visuomotor Control of Sparse Reward Iterative Tasks." Proceedings of the 5th Conference on Robot Learning, 2022.

Reinforcement learning (RL) has shown impressive success in exploring high-dimensional environments to learn complex tasks, but can often exhibit unsafe behaviors and require extensive environment interaction when exploration is unconstrained. A promising strategy for learning in dynamically uncertain environments is requiring that the agent can robustly return to learned Safe Sets, where task success (and therefore safety) can be guaranteed. While this approach has been successful in low-dimensions, enforcing this constraint in environments with visual observation spaces is exceedingly challenging. We present a novel continuous representation for Safe Sets framed as a binary classification problem in a learned latent space, which flexibly scales to high-dimensional image observations. We then present a new algorithm, Latent Space Safe Sets (LS3), which uses this representation for long-horizon control. We evaluate LS3 on 4 domains, including a challenging sequential pushing task in simulation and a physical cable routing task. We find that LS3 can use prior task successes to restrict exploration and learn more efficiently than prior algorithms while satisfying constraints. See https://tinyurl.com/latent-safe-sets for supplementary material.

@inproceedings{pmlrv164wilcox22a,
 abstract = {Reinforcement learning (RL) has shown impressive success in exploring high-dimensional environments to learn complex tasks, but can often exhibit unsafe behaviors and require extensive environment interaction when exploration is unconstrained. A promising strategy for learning in dynamically uncertain environments is requiring that the agent can robustly return to learned Safe Sets, where task success (and therefore safety) can be guaranteed. While this approach has been successful in low-dimensions, enforcing this constraint in environments with visual observation spaces is exceedingly challenging. We present a novel continuous representation for Safe Sets framed as a binary classification problem in a learned latent space, which flexibly scales to high-dimensional image observations. We then present a new algorithm, Latent Space Safe Sets (LS3), which uses this representation for long-horizon control. We evaluate LS3 on 4 domains, including a challenging sequential pushing task in simulation and a physical cable routing task. We find that LS3 can use prior task successes to restrict exploration and learn more efficiently than prior algorithms while satisfying constraints. See https://tinyurl.com/latent-safe-sets for supplementary material.},
 author = {Wilcox, Albert and Balakrishna, Ashwin and Thananjeyan, Brijen and Gonzalez, Joseph E. and Goldberg, Ken},
 bdsk-url-1 = {https://proceedings.mlr.press/v164/wilcox22a.html},
 booktitle = {Proceedings of the 5th Conference on Robot Learning},
 editor = {Faust, Aleksandra and Hsu, David and Neumann, Gerhard},
 keywords = {peerrev, selected},
 month = {11},
 pages = {959--969},
 pdf = {https://proceedings.mlr.press/v164/wilcox22a/wilcox22a.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = {LS3: Latent Space Safe Sets for Long-Horizon Visuomotor Control of Sparse Reward Iterative Tasks},
 url = {https://proceedings.mlr.press/v164/wilcox22a.html},
 volume = {164},
 year = {2022}
}

Brijen Thananjeyan, Justin Kerr, Huang Huang, Joseph E. Gonzalez, and Ken Goldberg. "All You Need is LUV: Unsupervised Collection of Labeled Images using Invisible UV Fluorescent Indicators." arXiv, 2022.

Large-scale semantic image annotation is a significant challenge for learning-based perception systems in robotics. Current approaches often rely on human labelers, which can be expensive, or simulation data, which can visually or physically differ from real data. This paper proposes Labels from UltraViolet (LUV), a novel framework that enables rapid, labeled data collection in real manipulation environments without human labeling. LUV uses transparent, ultraviolet-fluorescent paint with programmable ultraviolet LEDs to collect paired images of a scene in standard lighting and UV lighting to autonomously extract segmentation masks and keypoints via color segmentation. We apply LUV to a suite of diverse robot perception tasks to evaluate its labeling quality, flexibility, and data collection rate. Results suggest that LUV is 180-2500 times faster than a human labeler across the tasks. We show that LUV provides labels consistent with human annotations on unpainted test images. The networks trained on these labels are used to smooth and fold crumpled towels with 83\% success rate and achieve 1.7mm position error with respect to human labels on a surgical needle pose estimation task. The low cost of LUV makes it ideal as a lightweight replacement for human labeling systems, with the one-time setup costs at \$300 equivalent to the cost of collecting around 200 semantic segmentation labels on Amazon Mechanical Turk. Code, datasets, visualizations, and supplementary material can be found on github.

@misc{luv22,
 abstract = {Large-scale semantic image annotation is a significant challenge for learning-based perception systems in robotics. Current approaches often rely on human labelers, which can be expensive, or simulation data, which can visually or physically differ from real data. This paper proposes Labels from UltraViolet (LUV), a novel framework that enables rapid, labeled data collection in real manipulation environments without human labeling. LUV uses transparent, ultraviolet-fluorescent paint with programmable ultraviolet LEDs to collect paired images of a scene in standard lighting and UV lighting to autonomously extract segmentation masks and keypoints via color segmentation. We apply LUV to a suite of diverse robot perception tasks to evaluate its labeling quality, flexibility, and data collection rate. Results suggest that LUV is 180-2500 times faster than a human labeler across the tasks. We show that LUV provides labels consistent with human annotations on unpainted test images. The networks trained on these labels are used to smooth and fold crumpled towels with 83\% success rate and achieve 1.7mm position error with respect to human labels on a surgical needle pose estimation task. The low cost of LUV makes it ideal as a lightweight replacement for human labeling systems, with the one-time setup costs at \$300 equivalent to the cost of collecting around 200 semantic segmentation labels on Amazon Mechanical Turk. Code, datasets, visualizations, and supplementary material can be found on github.},
 arxivurl = {https://arxiv.org/abs/2203.04566},
 author = {Thananjeyan, Brijen and Kerr, Justin and Huang, Huang and Gonzalez, Joseph E. and Goldberg, Ken},
 bdsk-url-1 = {https://sites.google.com/berkeley.edu/luv},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2203.04566},
 code = {https://sites.google.com/berkeley.edu/luv},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2203.04566},
 keywords = {arxivpre},
 publisher = {arXiv},
 title = {All You Need is LUV: Unsupervised Collection of Labeled Images using Invisible UV Fluorescent Indicators},
 url = {https://sites.google.com/berkeley.edu/luv},
 year = {2022}
}
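
The core labeling step reduces to color segmentation of the UV-lit image. A minimal OpenCV sketch is below; the HSV bounds are invented placeholders, and the paper additionally calibrates per paint color, pairs UV and standard-lighting frames, and extracts keypoints.

    import cv2
    import numpy as np

    def mask_from_uv(uv_image_bgr, lower=(35, 80, 80), upper=(85, 255, 255)):
        hsv = cv2.cvtColor(uv_image_bgr, cv2.COLOR_BGR2HSV)
        mask = cv2.inRange(hsv, np.array(lower), np.array(upper))  # 0/255 segmentation mask
        # Light morphological cleanup to remove speckle noise.
        kernel = np.ones((5, 5), np.uint8)
        return cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)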

Ionel Gog, Sukrit Kalra, Peter Schafhalter, Joseph E. Gonzalez, and Ion Stoica. "D3: A Dynamic Deadline-Driven Approach for Building Autonomous Vehicles." Proceedings of the Seventeenth European Conference on Computer Systems, 2022.

Autonomous vehicles (AVs) must drive across a variety of challenging environments that impose continuously-varying deadlines and runtime-accuracy tradeoffs on their software pipelines. A deadline-driven execution of such AV pipelines requires a new class of systems that enable the computation to maximize accuracy under dynamically-varying deadlines. Designing these systems presents interesting challenges that arise from combining ease-of-development of AV pipelines with deadline specification and enforcement mechanisms. Our work addresses these challenges through D3 (Dynamic Deadline-Driven), a novel execution model that centralizes the deadline management, and allows applications to adjust their computation by modeling missed deadlines as exceptions. Further, we design and implement ERDOS, an open-source realization of D3 for AV pipelines that exposes fine-grained execution events to applications, and provides mechanisms to speculatively execute computation and enforce deadlines between an arbitrary set of events. Finally, we address the crucial lack of AV benchmarks through our state-of-the-art open-source AV pipeline, Pylot, that works seamlessly across simulators and real AVs. We evaluate the efficacy of D3 and ERDOS by driving Pylot across challenging driving scenarios spanning 50km, and observe a 68\% reduction in collisions as compared to prior execution models.

@inproceedings{erdos22,
 abstract = {Autonomous vehicles (AVs) must drive across a variety of challenging environments that impose continuously-varying deadlines and runtime-accuracy tradeoffs on their software pipelines. A deadline-driven execution of such AV pipelines requires a new class of systems that enable the computation to maximize accuracy under dynamically-varying deadlines. Designing these systems presents interesting challenges that arise from combining ease-of-development of AV pipelines with deadline specification and enforcement mechanisms. Our work addresses these challenges through D3 (Dynamic Deadline-Driven), a novel execution model that centralizes the deadline management, and allows applications to adjust their computation by modeling missed deadlines as exceptions. Further, we design and implement ERDOS, an open-source realization of D3 for AV pipelines that exposes fine-grained execution events to applications, and provides mechanisms to speculatively execute computation and enforce deadlines between an arbitrary set of events. Finally, we address the crucial lack of AV benchmarks through our state-of-the-art open-source AV pipeline, Pylot, that works seamlessly across simulators and real AVs. We evaluate the efficacy of D3 and ERDOS by driving Pylot across challenging driving scenarios spanning 50km, and observe a 68\% reduction in collisions as compared to prior execution models.},
 address = {New York, NY, USA},
 author = {Gog, Ionel and Kalra, Sukrit and Schafhalter, Peter and Gonzalez, Joseph E. and Stoica, Ion},
 bdsk-url-1 = {https://doi.org/10.1145/3492321.3519576},
 booktitle = {Proceedings of the Seventeenth European Conference on Computer Systems},
 code = {https://github.com/erdos-project/erdos},
 doi = {10.1145/3492321.3519576},
 isbn = {9781450391627},
 keywords = {peerrev, selected},
 location = {Rennes, France},
 numpages = {19},
 pages = {453--471},
 publisher = {Association for Computing Machinery},
 series = {EuroSys '22},
 title = {D3: A Dynamic Deadline-Driven Approach for Building Autonomous Vehicles},
 url = {https://doi.org/10.1145/3492321.3519576},
 year = {2022}
}
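
A toy illustration of the execution-model idea of surfacing missed deadlines as exceptions that the application can handle, for example by falling back to a cheaper model. Note this sketch only detects the miss after the computation finishes; ERDOS's actual runtime provides preemption, speculation, and event-based deadline enforcement.

    import time

    class DeadlineMissed(Exception):
        pass

    def run_with_deadline(fn, deadline_s, *args):
        start = time.monotonic()
        result = fn(*args)
        if time.monotonic() - start > deadline_s:
            raise DeadlineMissed(f"{fn.__name__} exceeded {deadline_s}s")
        return result

    def detect_obstacles(frame, accurate_model, fast_model, deadline_s=0.05):
        try:
            return run_with_deadline(accurate_model, deadline_s, frame)
        except DeadlineMissed:
            return fast_model(frame)  # degrade accuracy to stay within the deadline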

Bichen Wu, Ruizhe Cheng, Peizhao Zhang, Tianren Gao, Joseph E. Gonzalez, and Peter Vajda. "Data Efficient Language-Supervised Zero-Shot Recognition with Optimal Transport Distillation." International Conference on Learning Representations, 2022.

Traditional computer vision models are trained to predict a fixed set of predefined categories. Recently, natural language has been shown to be a broader and richer source of supervision that provides finer descriptions to visual concepts than supervised "gold" labels. Previous works, such as CLIP, use InfoNCE loss to train a model to predict the pairing between images and text captions. CLIP, however, is data hungry and requires more than 400M image-text pairs for training. The inefficiency can be partially attributed to the fact that the image-text pairs are noisy. To address this, we propose OTTER (Optimal TransporT distillation for Efficient zero-shot Recognition), which uses online entropic optimal transport to find a soft image-text match as labels for contrastive learning. Based on pretrained image and text encoders, models trained with OTTER achieve strong performance with only 3M image text pairs. Compared with InfoNCE loss, label smoothing, and knowledge distillation, OTTER consistently outperforms these baselines in zero shot evaluation on Google Open Images (19,958 classes) and multi-labeled ImageNet 10K (10032 classes) from Tencent ML-Images. Over 42 evaluations on 7 different dataset/architecture settings x 6 metrics, OTTER outperforms (32) or ties (2) all baselines in 34 of them.

@inproceedings{wu2022data,
 abstract = {Traditional computer vision models are trained to predict a fixed set of predefined categories. Recently, natural language has been shown to be a broader and richer source of supervision that provides finer descriptions to visual concepts than supervised "gold" labels. Previous works, such as CLIP, use InfoNCE loss to train a model to predict the pairing between images and text captions. CLIP, however, is data hungry and requires more than 400M image-text pairs for training. The inefficiency can be partially attributed to the fact that the image-text pairs are noisy. To address this, we propose OTTER (Optimal TransporT distillation for Efficient zero-shot Recognition), which uses online entropic optimal transport to find a soft image-text match as labels for contrastive learning. Based on pretrained image and text encoders, models trained with OTTER achieve strong performance with only 3M image text pairs. Compared with InfoNCE loss, label smoothing, and knowledge distillation, OTTER consistently outperforms these baselines in zero shot evaluation on Google Open Images (19,958 classes) and multi-labeled ImageNet 10K (10032 classes) from Tencent ML-Images. Over 42 evaluations on 7 different dataset/architecture settings x 6 metrics, OTTER outperforms (32) or ties (2) all baselines in 34 of them.},
 author = {Bichen Wu and Ruizhe Cheng and Peizhao Zhang and Tianren Gao and Joseph E. Gonzalez and Peter Vajda},
 bdsk-url-1 = {https://openreview.net/forum?id=G89-1yZLFHk},
 booktitle = {International Conference on Learning Representations},
 keywords = {peerrev, selected},
 title = {Data Efficient Language-Supervised Zero-Shot Recognition with Optimal Transport Distillation},
 url = {https://openreview.net/forum?id=G89-1yZLFHk},
 year = {2022}
}

Yaoqing Yang, Ryan Theisen, Liam Hodgkinson, Joseph E. Gonzalez, Kannan Ramchandran, Charles H. Martin, and Michael W. Mahoney. "Evaluating natural language processing models with generalization metrics that do not need access to any training or testing data." arXiv, 2022.

The search for effective and robust generalization metrics has been the focus of recent theoretical and empirical work. In this paper, we discuss the performance of natural language processing (NLP) models, and we evaluate various existing and novel generalization metrics. Compared to prior studies, we (i) focus on NLP instead of computer vision (CV), (ii) focus on generalization metrics that predict test error instead of the generalization gap, (iii) focus on generalization metrics that do not need access to data, and (iv) focus on the heavy-tail (HT) phenomenon that has received comparatively less attention in the study of deep neural networks (NNs). We extend recent HT-based work which focuses on power law (PL) distributions, and we study exponential (EXP) and exponentially truncated power law (E-TPL) fitting to the empirical spectral densities (ESDs) of weight matrices. Our detailed empirical studies show that (i) \emph{shape metrics}, or the metrics obtained from fitting the shape of the ESDs, perform uniformly better at predicting generalization performance than \emph{scale metrics} commonly studied in the literature, as measured by the \emph{average} rank correlations with the generalization performance for all of our experiments; (ii) among forty generalization metrics studied in our paper, the RANDDISTANCE metric, a new shape metric invented in this paper that measures the distance between empirical eigenvalues of weight matrices and those of randomly initialized weight matrices, achieves the highest worst-case rank correlation with generalization performance under a variety of training settings; and (iii) among the three HT distributions considered in our paper, the E-TPL fitting of ESDs performs the most robustly.
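
The metrics above are computed from the eigenvalue spectra of trained weight matrices alone, with no training or test data. A rough, data-free illustration of that pipeline, using a generic Hill-type tail-exponent estimate rather than the paper's PL/E-TPL fitting procedure (the matrix size, k, and function names are assumptions):

  import numpy as np

  def esd(weight):
      # Empirical spectral density: eigenvalues of the correlation matrix W^T W / n.
      return np.linalg.eigvalsh(weight.T @ weight / weight.shape[0])

  def tail_exponent(eigs, k=50):
      # Hill-style estimate of a power-law tail exponent from the k largest eigenvalues.
      tail = np.sort(eigs)[-k:]
      return 1.0 + k / np.sum(np.log(tail / tail[0]))

  rng = np.random.default_rng(0)
  W = rng.standard_normal((256, 1024))   # stand-in for one trained layer's weight matrix
  lam = esd(W)
  print(f"largest eigenvalue: {lam[-1]:.2f}  tail exponent estimate: {tail_exponent(lam):.2f}")

Shape statistics of this kind can then be rank-correlated with measured test performance across a family of trained models.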

@misc{Yaoqing22,
 abstract = {The search for effective and robust generalization metrics has been the focus of recent theoretical and empirical work. In this paper, we discuss the performance of natural language processing (NLP) models, and we evaluate various existing and novel generalization metrics. Compared to prior studies, we (i) focus on NLP instead of computer vision (CV), (ii) focus on generalization metrics that predict test error instead of the generalization gap, (iii) focus on generalization metrics that do not need access to data, and (iv) focus on the heavy-tail (HT) phenomenon that has received comparatively less attention in the study of deep neural networks (NNs). We extend recent HT-based work which focuses on power law (PL) distributions, and we study exponential (EXP) and exponentially truncated power law (E-TPL) fitting to the empirical spectral densities (ESDs) of weight matrices. Our detailed empirical studies show that (i) \emph{shape metrics}, or the metrics obtained from fitting the shape of the ESDs, perform uniformly better at predicting generalization performance than \emph{scale metrics} commonly studied in the literature, as measured by the \emph{average} rank correlations with the generalization performance for all of our experiments; (ii) among forty generalization metrics studied in our paper, the RANDDISTANCE metric, a new shape metric invented in this paper that measures the distance between empirical eigenvalues of weight matrices and those of randomly initialized weight matrices, achieves the highest worst-case rank correlation with generalization performance under a variety of training settings; and (iii) among the three HT distributions considered in our paper, the E-TPL fitting of ESDs performs the most robustly.},
 author = {Yang, Yaoqing and Theisen, Ryan and Hodgkinson, Liam and Gonzalez, Joseph E. and Ramchandran, Kannan and Martin, Charles H. and Mahoney, Michael W.},
 bdsk-url-1 = {https://arxiv.org/abs/2202.02842},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2202.02842},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2202.02842},
 keywords = {arxivpre, Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {Evaluating natural language processing models with generalization metrics that do not need access to any training or testing data},
 url = {https://arxiv.org/abs/2202.02842},
 year = {2022}
}

Jeffrey Ichnowski, Kaiyuan Chen, Karthik Dharmarajan, Simeon Adebola, Michael Danielczuk, Víctor Mayoral-Vilches, Hugo Zhan, Derek Xu, Ramtin Ghassemi, John Kubiatowicz, Ion Stoica, Joseph Gonzalez, and Ken Goldberg. "FogROS 2: An Adaptive and Extensible Platform for Cloud and Fog Robotics Using ROS 2." arXiv, 2022.

Mobility, power, and price points often dictate that robots do not have sufficient computing power on board to run modern robot algorithms at desired rates. Cloud computing providers such as AWS, GCP, and Azure offer immense computing power on demand, but tapping into that power from a robot is non-trivial. In this paper, we present FogROS 2, an easy-to-use, open-source platform to facilitate cloud and fog robotics compatible with the emerging ROS 2 standard, extending the open-source Robot Operating System (ROS). FogROS 2 provisions a cloud computer, deploys and launches ROS 2 nodes to the cloud computer, sets up secure networking between the robot and cloud, and starts the application running. FogROS 2 is completely redesigned and distinct from its predecessor to support ROS 2 applications, transparent video compression and communication, improved performance and security, support for multiple cloud-computing providers, and remote monitoring and visualization. We demonstrate in example applications that the performance gained by using cloud computers can overcome the network latency to significantly speed up robot performance. In examples, FogROS 2 reduces SLAM latency by 50\%, reduces grasp planning time from 14s to 1.2s, and speeds up motion planning 28x. When compared to alternatives, FogROS 2 reduces network utilization by up to 3.8x. FogROS 2 source, examples, and documentation are available on GitHub.

@misc{fogross2,
 abstract = {Mobility, power, and price points often dictate that robots do not have sufficient computing power on board to run modern robot algorithms at desired rates. Cloud computing providers such as AWS, GCP, and Azure offer immense computing power on demand, but tapping into that power from a robot is non-trivial. In this paper, we present FogROS 2, an easy-to-use, open-source platform to facilitate cloud and fog robotics compatible with the emerging ROS 2 standard, extending the open-source Robot Operating System (ROS). FogROS 2 provisions a cloud computer, deploys and launches ROS 2 nodes to the cloud computer, sets up secure networking between the robot and cloud, and starts the application running. FogROS 2 is completely redesigned and distinct from its predecessor to support ROS 2 applications, transparent video compression and communication, improved performance and security, support for multiple cloud-computing providers, and remote monitoring and visualization. We demonstrate in example applications that the performance gained by using cloud computers can overcome the network latency to significantly speed up robot performance. In examples, FogROS 2 reduces SLAM latency by 50\%, reduces grasp planning time from 14s to 1.2s, and speeds up motion planning 28x. When compared to alternatives, FogROS 2 reduces network utilization by up to 3.8x. FogROS 2, source, examples, and documentation is available on github.},
 author = {Ichnowski, Jeffrey and Chen, Kaiyuan and Dharmarajan, Karthik and Adebola, Simeon and Danielczuk, Michael and Mayoral-Vilches, Víctor and Zhan, Hugo and Xu, Derek and Ghassemi, Ramtin and Kubiatowicz, John and Stoica, Ion and Gonzalez, Joseph and Goldberg, Ken},
 bdsk-url-1 = {https://arxiv.org/abs/2205.09778},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2205.09778},
 code = {https://github.com/BerkeleyAutomation/FogROS2},
 copyright = {Creative Commons Attribution 4.0 International},
 doi = {10.48550/ARXIV.2205.09778},
 keywords = {arxivpre},
 publisher = {arXiv},
 title = {FogROS 2: An Adaptive and Extensible Platform for Cloud and Fog Robotics Using ROS 2},
 url = {https://arxiv.org/abs/2205.09778},
 year = {2022}
}

Sam Lau, Deborah Nolan, Joseph Gonzalez, and Philip J. Guo. "How Computer Science and Statistics Instructors Approach Data Science Pedagogy Differently: Three Case Studies." Proceedings of the 53rd ACM Technical Symposium on Computer Science Education V. 1, 2022.

Over the past decade, data science courses have been growing more popular across university campuses. These courses often involve a mix of programming and statistics and are taught by instructors from diverse backgrounds. In our experiences launching a data science program at a large public U.S. university over the past four years, we noticed one central tension within many such courses: instructors must finely balance how much computing versus statistics to teach in the limited available time. In this experience report, we provide a detailed firsthand reflection on how we have personally balanced these two major topic areas within several offerings of a large introductory data science course that we taught and wrote an accompanying textbook for; our course has served several thousand students over the past four years. We present three case studies from our experiences to illustrate how computer science and statistics instructors approach data science differently on topics ranging from algorithmic depth to modeling to data acquisition. We then draw connections to deeper tradeoffs in data science to help guide instructors who design interdisciplinary courses. We conclude by suggesting ways that instructors can incorporate both computer science and statistics perspectives to improve data science teaching.

@inproceedings{Lau22,
 abstract = {Over the past decade, data science courses have been growing more popular across university campuses. These courses often involve a mix of programming and statistics and are taught by instructors from diverse backgrounds. In our experiences launching a data science program at a large public U.S. university over the past four years, we noticed one central tension within many such courses: instructors must finely balance how much computing versus statistics to teach in the limited available time. In this experience report, we provide a detailed firsthand reflection on how we have personally balanced these two major topic areas within several offerings of a large introductory data science course that we taught and wrote an accompanying textbook for; our course has served several thousand students over the past four years. We present three case studies from our experiences to illustrate how computer science and statistics instructors approach data science differently on topics ranging from algorithmic depth to modeling to data acquisition. We then draw connections to deeper tradeoffs in data science to help guide instructors who design interdisciplinary courses. We conclude by suggesting ways that instructors can incorporate both computer science and statistics perspectives to improve data science teaching.},
 address = {New York, NY, USA},
 author = {Lau, Sam and Nolan, Deborah and Gonzalez, Joseph and Guo, Philip J.},
 bdsk-url-1 = {https://doi.org/10.1145/3478431.3499384},
 booktitle = {Proceedings of the 53rd ACM Technical Symposium on Computer Science Education V. 1},
 doi = {10.1145/3478431.3499384},
 isbn = {9781450390705},
 keywords = {data science, programming education, case studies, statistics},
 location = {Providence, RI, USA},
 numpages = {7},
 pages = {29--35},
 publisher = {Association for Computing Machinery},
 series = {SIGCSE 2022},
 title = {How Computer Science and Statistics Instructors Approach Data Science Pedagogy Differently: Three Case Studies},
 url = {https://doi.org/10.1145/3478431.3499384},
 year = {2022}
}

Paras Jain, Safeen Huda, Martin Maas, Joseph E. Gonzalez, Ion Stoica, and Azalia Mirhoseini. "Learning to Design Accurate Deep Learning Accelerators with Inaccurate Multipliers." 2022 Design, Automation & Test in Europe Conference & Exhibition (DATE), 2022.

Approximate computing is a promising way to improve the power efficiency of deep learning. While recent work proposes new arithmetic circuits (adders and multipliers) that consume substantially less power at the cost of computation errors, these approximate circuits decrease the end-to-end accuracy of common models. We present AutoApprox, a framework to automatically generate approximate low-power deep learning accelerators without any accuracy loss. AutoApprox generates a wide range of approximate ASIC accelerators with a TPUv3 systolic-array template. AutoApprox uses a learned router to assign each DNN layer to an approximate systolic array from a bank of arrays with varying approximation levels. By tailoring this routing for a specific neural network architecture, we discover circuit designs without the accuracy penalty from prior methods. Moreover, AutoApprox optimizes for the end-to-end performance, power, and area of the whole chip and PE mapping rather than simply measuring the performance of the arithmetic units in isolation. To our knowledge, our work is the first to demonstrate the effectiveness of custom-tailored approximate circuits in delivering significant chip-level energy savings with zero accuracy loss on a large-scale dataset such as ImageNet. AutoApprox synthesizes a novel approximate accelerator based on the TPU that reduces end-to-end power consumption by 3.2\% and area by 5.2\% at a sub-10nm process with no degradation in ImageNet validation top-1 and top-5 accuracy.
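
The paper learns a router that maps each DNN layer to one of several approximate systolic arrays. The brute-force toy below only illustrates the underlying assignment problem (maximize power savings subject to an accuracy budget) with made-up per-layer numbers; it is not the learned router or the chip-level evaluation.

  import itertools

  # Illustrative options per layer: (array variant, power saving, unit accuracy drop).
  LEVELS = [("exact", 0.00, 0.000), ("approx-lo", 0.03, 0.001), ("approx-hi", 0.06, 0.004)]
  LAYER_SENSITIVITY = [1.0, 0.2, 0.5, 0.1]   # toy sensitivities for a 4-layer network

  def best_assignment(accuracy_budget=0.002):
      best = None
      for choice in itertools.product(range(len(LEVELS)), repeat=len(LAYER_SENSITIVITY)):
          saving = sum(LEVELS[c][1] for c in choice)
          drop = sum(LEVELS[c][2] * s for c, s in zip(choice, LAYER_SENSITIVITY))
          if drop <= accuracy_budget and (best is None or saving > best[0]):
              best = (saving, drop, [LEVELS[c][0] for c in choice])
      return best

  print(best_assignment())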

@inproceedings{Jane22,
 abstract = {Approximate computing is a promising way to improve the power efficiency of deep learning. While recent work proposes new arithmetic circuits (adders and multipliers) that consume substantially less power at the cost of computation errors, these approximate circuits decrease the end-to-end accuracy of common models. We present AutoApprox, a framework to automatically generate approximate low-power deep learning accelerators without any accuracy loss. AutoApprox generates a wide range of approximate ASIC accelerators with a TPUv3 systolic-array template. AutoApprox uses a learned router to assign each DNN layer to an approximate systolic array from a bank of arrays with varying approximation levels. By tailoring this routing for a specific neural network architecture, we discover circuit designs without the accuracy penalty from prior methods. Moreover, AutoApprox optimizes for the end-to-end performance, power and area of the the whole chip and PE mapping rather than simply measuring the performance of the arithmetic units in iso-lation. To our knowledge, our work is the first to demonstrate the effectiveness of custom-tailored approximate circuits in delivering significant chip-level energy savings with zero accuracy loss on a large-scale dataset such as ImageNet. AutoApprox synthesizes a novel approximate accelerator based on the TPU that reduces end-to-end power consumption by 3.2\% and area by 5.2\% at a sub-10nm process with no degradation in ImageNet validation top-1 and top-5 accuracy.},
 author = {Jain, Paras and Huda, Safeen and Maas, Martin and Gonzalez, Joseph E. and Stoica, Ion and Mirhoseini, Azalia},
 bdsk-url-1 = {https://www.parasjain.com/projects/21autoapprox_date/paper.pdf},
 bdsk-url-2 = {https://doi.org/10.23919/DATE54114.2022.9774607},
 booktitle = {2022 Design, Automation \& Test in Europe Conference \& Exhibition (DATE)},
 doi = {10.23919/DATE54114.2022.9774607},
 keywords = {peerrev, selected},
 pages = {184-189},
 title = {Learning to Design Accurate Deep Learning Accelerators with Inaccurate Multipliers},
 url = {https://www.parasjain.com/projects/21autoapprox_date/paper.pdf},
 year = {2022}
}

Spencer Whitehead, Suzanne Petryk, Vedaad Shakib, Joseph Gonzalez, Trevor Darrell, Anna Rohrbach, and Marcus Rohrbach. "Reliable Visual Question Answering: Abstain Rather Than Answer Incorrectly." arXiv, 2022.

Machine learning has advanced dramatically, narrowing the accuracy gap to humans in multimodal tasks like visual question answering (VQA). However, while humans can say "I don't know" when they are uncertain (i.e., abstain from answering a question), such ability has been largely neglected in multimodal research, despite the importance of this problem to the usage of VQA in real settings. In this work, we promote a problem formulation for reliable VQA, where we prefer abstention over providing an incorrect answer. We first enable abstention capabilities for several VQA models, and analyze both their coverage, the portion of questions answered, and risk, the error on that portion. For that, we explore several abstention approaches. We find that although the best performing models achieve over 71\% accuracy on the VQA v2 dataset, introducing the option to abstain by directly using a model's softmax scores limits them to answering less than 8\% of the questions to achieve a low risk of error (i.e., 1\%). This motivates us to utilize a multimodal selection function to directly estimate the correctness of the predicted answers, which we show can increase the coverage by, for example, 2.4x from 6.8\% to 16.3\% at 1\% risk. While it is important to analyze both coverage and risk, these metrics have a trade-off which makes comparing VQA models challenging. To address this, we also propose an Effective Reliability metric for VQA that places a larger cost on incorrect answers compared to abstentions. This new problem formulation, metric, and analysis for VQA provide the groundwork for building effective and reliable VQA models that have the self-awareness to abstain if and only if they don't know the answer.
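
A small sketch of the evaluation quantities described above: coverage and risk at a confidence threshold, plus a cost-weighted reliability score in the spirit of the paper's Effective Reliability metric. The exact scoring rule used here is a simplified assumption, and the data are synthetic.

  import numpy as np

  def coverage_risk(confidence, correct, threshold):
      # Answer only when confidence clears the threshold; abstain otherwise.
      answered = confidence >= threshold
      coverage = answered.mean()
      risk = 1.0 - correct[answered].mean() if answered.any() else 0.0
      return coverage, risk

  def cost_weighted_reliability(confidence, correct, threshold, cost=1.0):
      # +1 for an answered correct question, -cost for an answered incorrect one,
      # 0 for an abstention.
      answered = confidence >= threshold
      return float((np.where(correct, 1.0, -cost) * answered).mean())

  rng = np.random.default_rng(0)
  conf = rng.uniform(size=1000)
  correct = rng.uniform(size=1000) < conf        # toy data: confidence is informative
  for t in (0.5, 0.8, 0.95):
      cov, risk = coverage_risk(conf, correct, t)
      print(f"threshold={t:.2f}  coverage={cov:.2f}  risk={risk:.2f}  "
            f"score={cost_weighted_reliability(conf, correct, t):.3f}")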

@misc{reliableVAQ22,
 abstract = {Machine learning has advanced dramatically, narrowing the accuracy gap to humans in multimodal tasks like visual question answering (VQA). However, while humans can say "I don't know" when they are uncertain (i.e., abstain from answering a question), such ability has been largely neglected in multimodal research, despite the importance of this problem to the usage of VQA in real settings. In this work, we promote a problem formulation for reliable VQA, where we prefer abstention over providing an incorrect answer. We first enable abstention capabilities for several VQA models, and analyze both their coverage, the portion of questions answered, and risk, the error on that portion. For that, we explore several abstention approaches. We find that although the best performing models achieve over 71\% accuracy on the VQA v2 dataset, introducing the option to abstain by directly using a model's softmax scores limits them to answering less than 8\% of the questions to achieve a low risk of error (i.e., 1\%). This motivates us to utilize a multimodal selection function to directly estimate the correctness of the predicted answers, which we show can increase the coverage by, for example, 2.4x from 6.8\% to 16.3\% at 1\% risk. While it is important to analyze both coverage and risk, these metrics have a trade-off which makes comparing VQA models challenging. To address this, we also propose an Effective Reliability metric for VQA that places a larger cost on incorrect answers compared to abstentions. This new problem formulation, metric, and analysis for VQA provide the groundwork for building effective and reliable VQA models that have the self-awareness to abstain if and only if they don't know the answer.},
 author = {Whitehead, Spencer and Petryk, Suzanne and Shakib, Vedaad and Gonzalez, Joseph and Darrell, Trevor and Rohrbach, Anna and Rohrbach, Marcus},
 bdsk-url-1 = {https://arxiv.org/abs/2204.13631},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2204.13631},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2204.13631},
 keywords = {arxivpre},
 publisher = {arXiv},
 title = {Reliable Visual Question Answering: Abstain Rather Than Answer Incorrectly},
 url = {https://arxiv.org/abs/2204.13631},
 year = {2022}
}

David Patterson, Joseph Gonzalez, Urs Hölzle, Quoc Le, Chen Liang, Lluis-Miquel Munguia, Daniel Rothchild, David R. So, Maud Texier, and Jeff Dean. "The Carbon Footprint of Machine Learning Training Will Plateau, Then Shrink." Computer, 2022.

Machine Learning (ML) workloads have rapidly grown in importance, but have raised concerns about their carbon footprint. Four best practices can reduce ML training energy by up to 100x and CO2 emissions by up to 1000x. By following best practices, overall ML energy use (across research, development, and production) held steady at <15\% of Google's total energy use for the past three years. If the whole ML field were to adopt best practices, total carbon emissions from training would shrink. Hence, we recommend that ML papers include emissions explicitly to foster competition on more than just model quality. Estimates of emissions in papers that omitted them have been off by 100x-100,000x, so publishing emissions has the added benefit of ensuring accurate accounting. Given the importance of climate change, we must get the numbers right to make certain that we work on its biggest challenges.

@article{patterson2022ieee,
 abstract = {Machine Learning (ML) workloads have rapidly grown in importance, but raised concerns about their carbon footprint. Four best practices can reduce ML training energy by up to 100x and CO2 emissions up to 1000x. By following best practices, overall ML energy use (across research, development, and production) held steady at <15\% of Google's total energy use for the past three years. If the whole ML field were to adopt best practices, total carbon emissions from training would reduce. Hence, we recommend that ML papers include emissions explicitly to foster competition on more than just model quality. Estimates of emissions in papers that omitted them have been off 100x-100,000x, so publishing emissions has the added benefit of ensuring accurate accounting. Given the importance of climate change, we must get the numbers right to make certain that we work on its biggest challenges.},
 author = {Patterson, David and Gonzalez, Joseph and H{\"o}lzle, Urs and Le, Quoc and Liang, Chen and Munguia, Lluis-Miquel and Rothchild, Daniel and So, David R. and Texier, Maud and Dean, Jeff},
 bdsk-url-1 = {https://arxiv.org/abs/2204.05149},
 bdsk-url-2 = {https://doi.org/10.1109/MC.2022.3148714},
 doi = {10.1109/MC.2022.3148714},
 journal = {Computer},
 keywords = {techreport, selected},
 number = {7},
 pages = {18-28},
 title = {The Carbon Footprint of Machine Learning Training Will Plateau, Then Shrink},
 url = {https://arxiv.org/abs/2204.05149},
 volume = {55},
 year = {2022}
}

Sarah Chasins, Alvin Cheung, Natacha Crooks, Ali Ghodsi, Ken Goldberg, Joseph E. Gonzalez, Joseph M. Hellerstein, Michael I. Jordan, Anthony D. Joseph, Michael W. Mahoney, Aditya Parameswaran, David Patterson, Raluca Ada Popa, Koushik Sen, Scott Shenker, Dawn Song, and Ion Stoica. "The Sky Above The Clouds." arXiv, 2022.

Technology ecosystems often undergo significant transformations as they mature. For example, telephony, the Internet, and PCs all started with a single provider, but in the United States each is now served by a competitive market that uses comprehensive and universal technology standards to provide compatibility. This white paper presents our view on how the cloud ecosystem, barely over fifteen years old, could evolve as it matures.

@misc{skyvision2022,
 abstract = {Technology ecosystems often undergo significant transformations as they mature. For example, telephony, the Internet, and PCs all started with a single provider, but in the United States each is now served by a competitive market that uses comprehensive and universal technology standards to provide compatibility. This white paper presents our view on how the cloud ecosystem, barely over fifteen years old, could evolve as it matures.},
 author = {Chasins, Sarah and Cheung, Alvin and Crooks, Natacha and Ghodsi, Ali and Goldberg, Ken and Gonzalez, Joseph E. and Hellerstein, Joseph M. and Jordan, Michael I. and Joseph, Anthony D. and Mahoney, Michael W. and Parameswaran, Aditya and Patterson, David and Popa, Raluca Ada and Sen, Koushik and Shenker, Scott and Song, Dawn and Stoica, Ion},
 bdsk-url-1 = {https://arxiv.org/abs/2205.07147},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2205.07147},
 copyright = {Creative Commons Attribution 4.0 International},
 doi = {10.48550/ARXIV.2205.07147},
 keywords = {arxivpre},
 publisher = {arXiv},
 title = {The Sky Above The Clouds},
 url = {https://arxiv.org/abs/2205.07147},
 year = {2022}
}

Kaiyuan Eric Chen, Yafei Liang, Nikhil Jha, Jeffrey Ichnowski, Michael Danielczuk, Joseph Gonzalez, John Kubiatowicz, and Ken Goldberg. "FogROS: An Adaptive Framework for Automating Fog Robotics Deployment." 2021 IEEE 17th International Conference on Automation Science and Engineering (CASE), 2021.

As many robot automation applications increasingly rely on multi-core processing or deep-learning models, cloud computing is becoming an attractive and economically viable resource for systems that do not have high computing power onboard. Despite its immense computing capacity, it is often underused by the robotics and automation community due to a lack of expertise in cloud computing and cloud-based infrastructure. Fog Robotics balances computing and data between cloud and edge devices. We propose a software framework, FogROS, as an extension of the Robot Operating System (ROS), the de facto standard for creating robot automation applications and components. It allows researchers to deploy components of their software to the cloud with minimal effort, and correspondingly gain access to additional computing cores, GPUs, FPGAs, and TPUs, as well as predeployed software made available by other researchers. FogROS allows a researcher to specify which components of their software will be deployed to the cloud and to what type of computing hardware. We evaluate FogROS on 3 examples: (1) simultaneous localization and mapping (ORB-SLAM2), (2) Dexterity Network (Dex-Net) GPU-based grasp planning, and (3) multi-core motion planning using a 96-core cloud-based server. In all three examples, a component is deployed to the cloud and accelerated with a small change in the system launch configuration; while this incurs additional network latency of 1.2 s, 0.6 s, and 0.5 s, computation speed improves by 2.6×, 6.0×, and 34.2×, respectively. Code, videos, and supplementary material can be found at https://github.com/BerkeleyAutomation/FogROS.

@inproceedings{Chen21,
 abstract = {As many robot automation applications increasingly rely on multi-core processing or deep-learning models, cloud computing is becoming an attractive and economically viable resource for systems that do not contain high computing power onboard. Despite its immense computing capacity, it is often underused by the robotics and automation community due to lack of expertise in cloud computing and cloud-based infrastructure. Fog Robotics balances computing and data between cloud edge devices. We propose a software framework, FogROS, as an extension of the Robot Operating System (ROS), the defacto standard for creating robot automation applications and components. It allows researchers to deploy components of their software to the cloud with minimal effort, and correspondingly gain access to additional computing cores, GPUs, FPGAs, and TPUs, as well as predeployed software made available by other researchers. FogROS allows a researcher to specify which components of their software will be deployed to the cloud and to what type of computing hardware. We evaluate FogROS on 3 examples: (1) simultaneous localization and mapping (ORB-SLAM2), (2) Dexterity Network (Dex-Net) GPU-based grasp planning, and (3) multi-core motion planning using a 96-core cloud-based server. In all three examples, a component is deployed to the cloud and accelerated with a small change in system launch configuration, while incurring additional latency of 1.2 s, 0.6 s, and 0.5 s due to network communication, the computation speed is improved by 2.6×, 6.0× and 34.2×, respectively. Code, videos, and supplementary material can be found at https://github.com/BerkeleyAutomation/FogROS.},
 author = {Chen, Kaiyuan Eric and Liang, Yafei and Jha, Nikhil and Ichnowski, Jeffrey and Danielczuk, Michael and Gonzalez, Joseph and Kubiatowicz, John and Goldberg, Ken},
 bdsk-url-1 = {https://arxiv.org/abs/2108.11355},
 bdsk-url-2 = {https://doi.org/10.1109/CASE49439.2021.9551628},
 booktitle = {2021 IEEE 17th International Conference on Automation Science and Engineering (CASE)},
 code = {https://github.com/BerkeleyAutomation/FogROS},
 doi = {10.1109/CASE49439.2021.9551628},
 issn = {2161-8089},
 keywords = {peerrev, selected},
 month = {8},
 pages = {2035-2042},
 title = {FogROS: An Adaptive Framework for Automating Fog Robotics Deployment},
 url = {https://arxiv.org/abs/2108.11355},
 year = {2021}
}

Jianfei Chen, Lianmin Zheng, Zhewei Yao, Dequan Wang, Ion Stoica, Michael Mahoney, and Joseph Gonzalez. "ActNN: Reducing Training Memory Footprint via 2-Bit Activation Compressed Training." Proceedings of the 38th International Conference on Machine Learning, 2021.

The increasing size of neural network models has been critical for improvements in their accuracy, but device memory is not growing at the same rate. This creates fundamental challenges for training neural networks within limited memory environments. In this work, we propose ActNN, a memory-efficient training framework that stores randomly quantized activations for backpropagation. We prove the convergence of ActNN for general network architectures, and we characterize the impact of quantization on the convergence via an exact expression for the gradient variance. Using our theory, we propose novel mixed-precision quantization strategies that exploit the activation's heterogeneity across feature dimensions, samples, and layers. These techniques can be readily applied to existing dynamic graph frameworks, such as PyTorch, simply by substituting the layers. We evaluate ActNN on mainstream computer vision models for classification, detection, and segmentation tasks. On all these tasks, ActNN compresses the activation to 2 bits on average, with negligible accuracy loss. ActNN reduces the memory footprint of the activation by 12x, and it enables training with a 6.6x to 14x larger batch size.
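
A per-tensor toy version of the central trick, assuming uniform 2-bit levels and stochastic rounding so that the dequantized activation is unbiased; ActNN itself uses finer per-group, mixed-precision quantization integrated into PyTorch layers.

  import numpy as np

  def quantize_2bit(x, rng):
      # Map x onto 4 levels with stochastic rounding: E[dequantize(quantize(x))] = x.
      lo, hi = x.min(), x.max()
      scale = (hi - lo) / 3.0 + 1e-12            # 4 levels span 3 intervals
      normalized = (x - lo) / scale
      floor = np.floor(normalized)
      q = floor + (rng.uniform(size=x.shape) < normalized - floor)
      return q.astype(np.uint8), lo, scale

  def dequantize(q, lo, scale):
      return q.astype(np.float32) * scale + lo

  rng = np.random.default_rng(0)
  act = rng.standard_normal((4, 16)).astype(np.float32)
  q, lo, scale = quantize_2bit(act, rng)
  approx = dequantize(q, lo, scale)
  print("mean abs error:", float(np.abs(act - approx).mean()))
  print("mean signed error (~0, unbiased):", float((act - approx).mean()))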

@inproceedings{pmlrv139chen21z,
 abstract = {The increasing size of neural network models has been critical for improvements in their accuracy, but device memory is not growing at the same rate. This creates fundamental challenges for training neural networks within limited memory environments. In this work, we propose ActNN, a memory-efficient training framework that stores randomly quantized activations for back propagation. We prove the convergence of ActNN for general network architectures, and we characterize the impact of quantization on the convergence via an exact expression for the gradient variance. Using our theory, we propose novel mixed-precision quantization strategies that exploit the activation's heterogeneity across feature dimensions, samples, and layers. These techniques can be readily applied to existing dynamic graph frameworks, such as PyTorch, simply by substituting the layers. We evaluate ActNN on mainstream computer vision models for classification, detection, and segmentation tasks. On all these tasks, ActNN compresses the activation to 2 bits on average, with negligible accuracy loss. ActNN reduces the memory footprint of the activation by 12x, and it enables training with a 6.6x to 14x larger batch size.},
 author = {Chen, Jianfei and Zheng, Lianmin and Yao, Zhewei and Wang, Dequan and Stoica, Ion and Mahoney, Michael and Gonzalez, Joseph},
 bdsk-url-1 = {https://proceedings.mlr.press/v139/chen21z.html},
 booktitle = {Proceedings of the 38th International Conference on Machine Learning},
 editor = {Meila, Marina and Zhang, Tong},
 keywords = {peerrev, selected},
 month = {7},
 pages = {1803--1813},
 pdf = {http://proceedings.mlr.press/v139/chen21z/chen21z.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = {ActNN: Reducing Training Memory Footprint via 2-Bit Activation Compressed Training},
 url = {https://proceedings.mlr.press/v139/chen21z.html},
 volume = {139},
 year = {2021}
}

Brijen Thananjeyan, Ashwin Balakrishna, Suraj Nair, Michael Luo, Krishnan Srinivasan, Minho Hwang, Joseph E. Gonzalez, Julian Ibarz, Chelsea Finn, and Ken Goldberg. "Recovery RL: Safe Reinforcement Learning With Learned Recovery Zones." IEEE Robotics and Automation Letters, 2021.

Safety remains a central obstacle preventing widespread use of RL in the real world: learning new tasks in uncertain environments requires extensive exploration, but safety requires limiting exploration. We propose Recovery RL, an algorithm which navigates this tradeoff by (1) leveraging offline data to learn about constraint violating zones before policy learning and (2) separating the goals of improving task performance and constraint satisfaction across two policies: a task policy that only optimizes the task reward and a recovery policy that guides the agent to safety when constraint violation is likely. We evaluate Recovery RL on 6 simulation domains, including two contact-rich manipulation tasks and an image-based navigation task, and an image-based obstacle avoidance task on a physical robot. We compare Recovery RL to 5 prior safe RL methods which jointly optimize for task performance and safety via constrained optimization or reward shaping and find that Recovery RL outperforms the next best prior method across all domains. Results suggest that Recovery RL trades off constraint violations and task successes 2--20 times more efficiently in simulation domains and 3 times more efficiently in physical experiments. See \url{https://tinyurl.com/rl-recovery} for videos and supplementary material.
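
The action-selection rule can be illustrated in a few lines: propose the task policy's action, and if a learned safety critic predicts too high a chance of constraint violation, execute the recovery policy instead. Everything below (the 1-D world, the threshold eps_risk, the stand-in critic) is a toy assumption, not the paper's implementation.

  def select_action(state, task_policy, recovery_policy, q_risk, eps_risk=0.3):
      # Switch to the recovery policy when the proposed task action looks unsafe.
      a_task = task_policy(state)
      if q_risk(state, a_task) > eps_risk:
          return recovery_policy(state), True
      return a_task, False

  # Toy 1-D world: the goal is near x = 1.0, and x > 1.1 risks a constraint violation.
  task_policy = lambda x: 0.3                # always push toward the goal
  recovery_policy = lambda x: -0.3           # back away from the constraint boundary
  q_risk = lambda x, a: float(x + a > 1.1)   # stand-in for a learned risk critic

  x = 0.0
  for t in range(8):
      a, recovered = select_action(x, task_policy, recovery_policy, q_risk)
      x += a
      print(f"t={t}  x={x:+.1f}  {'recovery' if recovered else 'task'} action")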

@article{Thananjeyan21c,
 abstract = {Safety remains a central obstacle preventing widespread use of RL in the real world: learning new tasks in uncertain environments requires extensive exploration, but safety requires limiting exploration. We propose Recovery RL, an algorithm which navigates this tradeoff by (1) leveraging offline data to learn about constraint violating zones before policy learning and (2) separating the goals of improving task performance and constraint satisfaction across two policies: a task policy that only optimizes the task reward and a recovery policy that guides the agent to safety when constraint violation is likely. We evaluate Recovery RL on 6 simulation domains, including two contact-rich manipulation tasks and an image-based navigation task, and an image-based obstacle avoidance task on a physical robot. We compare Recovery RL to 5 prior safe RL methods which jointly optimize for task performance and safety via constrained optimization or reward shaping and find that Recovery RL outperforms the next best prior method across all domains. Results suggest that Recovery RL trades off constraint violations and task successes 2--20 times more efficiently in simulation domains and 3 times more efficiently in physical experiments. See \url{https://tinyurl.com/rl-recovery} for videos and supplementary material.},
 author = {Thananjeyan, Brijen and Balakrishna, Ashwin and Nair, Suraj and Luo, Michael and Srinivasan, Krishnan and Hwang, Minho and Gonzalez, Joseph E. and Ibarz, Julian and Finn, Chelsea and Goldberg, Ken},
 bdsk-url-1 = {https://arxiv.org/abs/2010.15920},
 bdsk-url-2 = {https://doi.org/10.1109/LRA.2021.3070252},
 code = {https://tinyurl.com/rl-recovery},
 doi = {10.1109/LRA.2021.3070252},
 issn = {2377-3766},
 journal = {IEEE Robotics and Automation Letters},
 keywords = {peerrev, selected},
 month = {7},
 number = {3},
 pages = {4915-4922},
 title = {Recovery RL: Safe Reinforcement Learning With Learned Recovery Zones},
 url = {https://arxiv.org/abs/2010.15920},
 volume = {6},
 year = {2021}
}

Brijen Thananjeyan, Kirthevasan Kandasamy, Ion Stoica, Michael Jordan, Ken Goldberg, and Joseph Gonzalez. "Resource Allocation in Multi-armed Bandit Exploration: Overcoming Sublinear Scaling with Adaptive Parallelism." Proceedings of the 38th International Conference on Machine Learning, 2021.

We study exploration in stochastic multi-armed bandits when we have access to a divisible resource that can be allocated in varying amounts to arm pulls. We focus in particular on the allocation of distributed computing resources, where we may obtain results faster by allocating more resources per pull, but might have reduced throughput due to nonlinear scaling. For example, in simulation-based scientific studies, an expensive simulation can be sped up by running it on multiple cores. This speed-up, however, is partly offset by the communication among cores, which results in lower throughput than if fewer cores were allocated to run more trials in parallel. In this paper, we explore these trade-offs in two settings. First, in a fixed confidence setting, we need to find the best arm with a given target success probability as quickly as possible. We propose an algorithm which trades off between information accumulation and throughput and show that the time taken can be upper bounded by the solution of a dynamic program whose inputs are the gaps between the sub-optimal and optimal arms. We also prove a matching hardness result. Second, we present an algorithm for a fixed deadline setting, where we are given a time deadline and need to maximize the probability of finding the best arm. We corroborate our theoretical insights with simulation experiments that show that the algorithms consistently match or outperform baseline algorithms on a variety of problem instances.
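
The latency/throughput trade-off at the heart of the paper can be seen with an Amdahl's-law toy: giving each arm pull more cores finishes that pull sooner but lowers the total number of pulls completed per unit time. The scaling curve and constants below are illustrative assumptions, not the paper's model.

  def pull_time(cores, serial_frac=0.1):
      # Amdahl-style speedup: only the parallel fraction of a pull benefits from cores.
      return serial_frac + (1.0 - serial_frac) / cores

  TOTAL_CORES = 96
  for cores_per_pull in (1, 4, 16, 96):
      concurrent_pulls = TOTAL_CORES // cores_per_pull
      throughput = concurrent_pulls / pull_time(cores_per_pull)   # pulls per unit time
      print(f"{cores_per_pull:3d} cores/pull: latency={pull_time(cores_per_pull):.3f}, "
            f"throughput={throughput:.1f} pulls/unit time")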

@inproceedings{thananjeyan21a,
 abstract = {We study exploration in stochastic multi-armed bandits when we have access to a divisible resource that can be allocated in varying amounts to arm pulls. We focus in particular on the allocation of distributed computing resources, where we may obtain results faster by allocating more resources per pull, but might have reduced throughput due to nonlinear scaling. For example, in simulation-based scientific studies, an expensive simulation can be sped up by running it on multiple cores. This speed-up however, is partly offset by the communication among cores, which results in lower throughput than if fewer cores were allocated to run more trials in parallel. In this paper, we explore these trade-offs in two settings. First, in a fixed confidence setting, we need to find the best arm with a given target success probability as quickly as possible. We propose an algorithm which trades off between information accumulation and throughput and show that the time taken can be upper bounded by the solution of a dynamic program whose inputs are the gaps between the sub-optimal and optimal arms. We also prove a matching hardness result. Second, we present an algorithm for a fixed deadline setting, where we are given a time deadline and need to maximize the probability of finding the best arm. We corroborate our theoretical insights with simulation experiments that show that the algorithms consistently match or outperform baseline algorithms on a variety of problem instances.},
 author = {Thananjeyan, Brijen and Kandasamy, Kirthevasan and Stoica, Ion and Jordan, Michael and Goldberg, Ken and Gonzalez, Joseph},
 bdsk-url-1 = {https://proceedings.mlr.press/v139/thananjeyan21a.html},
 booktitle = {Proceedings of the 38th International Conference on Machine Learning},
 editor = {Meila, Marina and Zhang, Tong},
 keywords = {peerrev, selected},
 month = {7},
 pages = {10236--10246},
 pdf = {http://proceedings.mlr.press/v139/thananjeyan21a/thananjeyan21a.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = {Resource Allocation in Multi-armed Bandit Exploration: Overcoming Sublinear Scaling with Adaptive Parallelism},
 url = {https://proceedings.mlr.press/v139/thananjeyan21a.html},
 volume = {139},
 year = {2021}
}

Ruizhe Cheng, Bichen Wu, Peizhao Zhang, Peter Vajda, and Joseph E. Gonzalez. "Data-Efficient Language-Supervised Zero-Shot Learning With Self-Distillation." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops, 2021.

Traditional computer vision models are trained to predict a fixed set of predefined categories. Recently, natural language has been shown to be a broader and richer source of supervision that provides finer descriptions to visual concepts than supervised "gold" labels. Previous works, such as CLIP, use a simple pretraining task of predicting the pairings between images and text captions. CLIP, however, is data hungry and requires more than 400M image text pairs for training. We propose a data-efficient contrastive distillation method that uses soft labels to learn from noisy image-text pairs. Our model transfers knowledge from pretrained image and sentence encoders and achieves strong performance with only 3M image text pairs, 133x smaller than CLIP. Our method exceeds the previous SoTA of general zero-shot learning on ImageNet 21k+1k by $73\%$ relatively with a ResNet50 image encoder and DeCLUTR text encoder. We also beat CLIP by $10.5\%$ relatively on zero-shot evaluation on Google Open Images (19,958 classes).
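
A compact sketch of soft-label contrastive distillation under simplifying assumptions: the teacher's image-text similarities are softened with a temperature and blended with the hard identity pairing to form the student's targets. The mixing weight, temperature, and tiny random "encoders" below are illustrative only.

  import numpy as np

  def softmax(x, axis=-1):
      z = np.exp(x - x.max(axis=axis, keepdims=True))
      return z / z.sum(axis=axis, keepdims=True)

  def soft_targets(teacher_img, teacher_txt, alpha=0.5, temperature=0.1):
      # Blend the hard identity pairing with teacher similarities into soft labels.
      sims = teacher_img @ teacher_txt.T
      return alpha * np.eye(len(sims)) + (1.0 - alpha) * softmax(sims / temperature)

  rng = np.random.default_rng(0)
  img = rng.normal(size=(4, 32)); img /= np.linalg.norm(img, axis=1, keepdims=True)
  txt = rng.normal(size=(4, 32)); txt /= np.linalg.norm(txt, axis=1, keepdims=True)
  print(soft_targets(img, txt).round(2))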

@inproceedings{Cheng21,
 abstract = {Traditional computer vision models are trained to predict a fixed set of predefined categories. Recently, natural language has been shown to be a broader and richer source of supervision that provides finer descriptions to visual concepts than supervised "gold" labels. Previous works, such as CLIP, use a simple pretraining task of predicting the pairings between images and text captions. CLIP, however, is data hungry and requires more than 400M image text pairs for training. We propose a data-efficient contrastive distillation method that uses soft labels to learn from noisy image-text pairs. Our model transfers knowledge from pretrained image and sentence encoders and achieves strong performance with only 3M image text pairs, 133x smaller than CLIP. Our method exceeds the previous SoTA of general zero-shot learning on ImageNet 21k+1k by $73\%$ relatively with a ResNet50 image encoder and DeCLUTR text encoder. We also beat CLIP by $10.5\%$ relatively on zero-shot evaluation on Google Open Images (19,958 classes).},
 author = {Cheng, Ruizhe and Wu, Bichen and Zhang, Peizhao and Vajda, Peter and Gonzalez, Joseph E.},
 booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
 keywords = {peerrev, selected},
 month = {6},
 pages = {3119-3124},
 title = {Data-Efficient Language-Supervised Zero-Shot Learning With Self-Distillation},
 year = {2021}
}

Xiaoliang Dai, Alvin Wan, Peizhao Zhang, Bichen Wu, Zijian He, Zhen Wei, Kan Chen, Yuandong Tian, Matthew Yu, Peter Vajda, and Joseph E. Gonzalez. "FBNetV3: Joint Architecture-Recipe Search Using Predictor Pretraining." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2021.

Neural Architecture Search (NAS) yields state-of-the-art neural networks that outperform their best manually-designed counterparts. However, previous NAS methods search for architectures under one set of training hyper-parameters (i.e., a training recipe), overlooking superior architecture-recipe combinations. To address this, we present Neural Architecture-Recipe Search (NARS) to search both (a) architectures and (b) their corresponding training recipes, simultaneously. NARS utilizes an accuracy predictor that scores architecture and training recipes jointly, guiding both sample selection and ranking. Furthermore, to compensate for the enlarged search space, we leverage "free" architecture statistics (e.g., FLOP count) to pretrain the predictor, significantly improving its sample efficiency and prediction reliability. After training the predictor via constrained iterative optimization, we run fast evolutionary searches in just CPU minutes to generate architecture-recipe pairs for a variety of resource constraints, called FBNetV3. FBNetV3 makes up a family of state-of-the-art compact neural networks that outperform both automatically and manually-designed competitors. For example, FBNetV3 matches both EfficientNet and ResNeSt accuracy on ImageNet with up to 2.0x and 7.1x fewer FLOPs, respectively. Furthermore, FBNetV3 yields significant performance gains for downstream object detection tasks, improving mAP despite $18\%$ fewer FLOPs and $34\%$ fewer parameters than EfficientNet-based equivalents.

@inproceedings{Dai21,
 abstract = {Neural Architecture Search (NAS) yields state-of-the-art neural networks that outperform their best manually-designed counterparts. However, previous NAS methods search for architectures under one set of training hyper-parameters (i.e., a training recipe), overlooking superior architecture-recipe combinations. To address this, we present Neural Architecture-Recipe Search (NARS) to search both (a) architectures and (b) their corresponding training recipes, simultaneously. NARS utilizes an accuracy predictor that scores architecture and training recipes jointly, guiding both sample selection and ranking. Furthermore, to compensate for the enlarged search space, we leverage "free" architecture statistics (e.g., FLOP count) to pretrain the predictor, significantly improving its sample efficiency and prediction reliability. After training the predictor via constrained iterative optimization, we run fast evolutionary searches in just CPU minutes to generate architecture-recipe pairs for a variety of resource constraints, called FBNetV3. FBNetV3 makes up a family of state-of-the-art compact neural networks that outperform both automatically and manually-designed competitors. For example, FBNetV3 matches both EfficientNet and ResNeSt accuracy on ImageNet with up to 2.0x and 7.1x fewer FLOPs, respectively. Furthermore, FBNetV3 yields significant performance gains for downstream object detection tasks, improving mAP despite $18\%$ fewer FLOPs and $34\%$ fewer parameters than EfficientNet-based equivalents.},
 author = {Dai, Xiaoliang and Wan, Alvin and Zhang, Peizhao and Wu, Bichen and He, Zijian and Wei, Zhen and Chen, Kan and Tian, Yuandong and Yu, Matthew and Vajda, Peter and Gonzalez, Joseph E.},
 booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
 keywords = {peerrev, selected},
 month = {6},
 pages = {16276-16285},
 title = {FBNetV3: Joint Architecture-Recipe Search Using Predictor Pretraining},
 year = {2021}
}

Samuel Paradis, Minho Hwang, Brijen Thananjeyan, Jeffrey Ichnowski, Daniel Seita, Danyal Fer, Thomas Low, Joseph E. Gonzalez, and Ken Goldberg. "Intermittent Visual Servoing: Efficiently Learning Policies Robust to Instrument Changes for High-precision Surgical Manipulation." 2021 IEEE International Conference on Robotics and Automation (ICRA), 2021.

Assisting surgeons with automation of surgical subtasks is challenging due to backlash, hysteresis, and variable tensioning in cable-driven robots. These issues are exacerbated as surgical instruments are changed during an operation. In this work, we propose a framework for automation of high-precision surgical subtasks by learning local, sample-efficient, accurate, closed-loop policies that use visual feedback instead of robot encoder estimates. This framework, which we call deep Intermittent Visual Servoing (IVS), switches to a learned visual servo policy for high-precision segments of repetitive surgical tasks while relying on a coarse open-loop policy for the segments where precision is not necessary. We train the policy using only 180 human demonstrations that are roughly 2 seconds each. Results on a da Vinci Research Kit suggest that combining the coarse policy with half a second of corrections from the learned policy during each high-precision segment improves the success rate on the Fundamentals of Laparoscopic Surgery peg transfer task from $72.9\%$ to $99.2\%$, $31.3\%$ to $99.2\%$, and $47.2\%$ to $100.0\%$ for 3 instruments with differing cable properties. In the contexts we studied, IVS attains the highest published success rates for automated surgical peg transfer and is significantly more reliable than previous techniques when instruments are changed. Supplementary material is available at https://tinyurl.com/ivs-icra.

@inproceedings{Paradis21,
 abstract = {Assisting surgeons with automation of surgical subtasks is challenging due to backlash, hysteresis, and variable tensioning in cable-driven robots. These issues are exacerbated as surgical instruments are changed during an operation. In this work, we propose a framework for automation of high- precision surgical subtasks by learning local, sample-efficient, accurate, closed-loop policies that use visual feedback instead of robot encoder estimates. This framework, which we call deep Intermittent Visual Servoing (IVS), switches to a learned visual servo policy for high-precision segments of repetitive surgical tasks while relying on a coarse open-loop policy for the segments where precision is not necessary. We train the policy using only 180 human demonstrations that are roughly 2 seconds each. Results on a da Vinci Research Kit suggest that combining the coarse policy with half a second of corrections from the learned policy during each high-precision segment improves the success rate on the Fundamentals of Laparoscopic Surgery peg transfer task from $72.9\%$ to $99.2\%$, $31.3\%$ to $99.2\%$, and $47.2\%$ to $100.0\%$ for 3 instruments with differing cable properties. In the contexts we studied, IVS attains the highest published success rates for automated surgical peg transfer and is significantly more reliable than previous techniques when instruments are changed. Supplementary material is available at https://tinyurl.com/ivs-icra.},
 author = {Paradis, Samuel and Hwang, Minho and Thananjeyan, Brijen and Ichnowski, Jeffrey and Seita, Daniel and Fer, Danyal and Low, Thomas and Gonzalez, Joseph E. and Goldberg, Ken},
 bdsk-url-1 = {https://arxiv.org/abs/2011.06163},
 bdsk-url-2 = {https://doi.org/10.1109/ICRA48506.2021.9561070},
 booktitle = {2021 IEEE International Conference on Robotics and Automation (ICRA)},
 code = {https://tinyurl.com/ivs-icra},
 doi = {10.1109/ICRA48506.2021.9561070},
 issn = {2577-087X},
 keywords = {peerrev, selected},
 month = {5},
 pages = {7166-7173},
 title = {Intermittent Visual Servoing: Efficiently Learning Policies Robust to Instrument Changes for High-precision Surgical Manipulation},
 url = {https://arxiv.org/abs/2011.06163},
 year = {2021}
}

Aditya Ganapathi, Priya Sundaresan, Brijen Thananjeyan, Ashwin Balakrishna, Daniel Seita, Jennifer Grannen, Minho Hwang, Ryan Hoque, Joseph E. Gonzalez, Nawid Jamali, Katsu Yamane, Soshi Iba, and Ken Goldberg. "Learning Dense Visual Correspondences in Simulation to Smooth and Fold Real Fabrics." 2021 IEEE International Conference on Robotics and Automation (ICRA), 2021.

Robotic fabric manipulation is challenging due to the infinite dimensional configuration space, self-occlusion, and complex dynamics of fabrics. There has been significant prior work on learning policies for specific fabric manipulation tasks, but comparatively less focus on algorithms which can perform many different tasks. We take a step towards this goal by learning point-pair correspondences across different fabric configurations in simulation. Then, given a single demonstration of a new task from an initial fabric configuration, these correspondences can be used to compute geometrically equivalent actions in a new fabric configuration. This makes it possible to define policies to robustly imitate a broad set of multi-step fabric smoothing and folding tasks. The resulting policies achieve $80.3\%$ average task success rate across 10 fabric manipulation tasks on two different physical robotic systems. Results also suggest robustness to fabrics of various colors, sizes, and shapes. See https://tinyurl.com/fabric-descriptors for supplementary material and videos.
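
The core mechanism, transferring a demonstrated action via learned dense correspondences, reduces to a nearest-neighbor lookup in descriptor space. The toy below fakes the descriptors with a pixel permutation so the transferred point can be checked by eye; in the paper the descriptors come from a network trained in simulation.

  import numpy as np

  def transfer_point(demo_descriptors, new_descriptors, demo_pixel):
      # Find the pixel in the new image whose descriptor best matches the demo pixel's.
      d = demo_descriptors[demo_pixel]
      dist = np.linalg.norm(new_descriptors - d, axis=-1)
      return np.unravel_index(np.argmin(dist), dist.shape)

  rng = np.random.default_rng(0)
  H, W, D = 32, 32, 16
  demo = rng.normal(size=(H, W, D))                 # per-pixel descriptors (demo config)
  perm = rng.permutation(H * W)                     # toy "new configuration" of the fabric
  new = demo.reshape(-1, D)[perm].reshape(H, W, D)
  pick = (10, 20)
  print("demo pick:", pick, "-> transferred pick:", transfer_point(demo, new, pick))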

@inproceedings{Ganapathi21,
 abstract = {Robotic fabric manipulation is challenging due to the infinite dimensional configuration space, self-occlusion, and complex dynamics of fabrics. There has been significant prior work on learning policies for specific fabric manipulation tasks, but comparatively less focus on algorithms which can perform many different tasks. We take a step towards this goal by learning point-pair correspondences across different fabric configurations in simulation. Then, given a single demonstration of a new task from an initial fabric configuration, these correspondences can be used to compute geometrically equivalent actions in a new fabric configuration. This makes it possible to define policies to robustly imitate a broad set of multi-step fabric smoothing and folding tasks. The resulting policies achieve $80.3\%$ average task success rate across 10 fabric manipulation tasks on two different physical robotic systems. Results also suggest robustness to fabrics of various colors, sizes, and shapes. See https://tinyurl.com/fabric-descriptors for supplementary material and videos.},
 author = {Ganapathi, Aditya and Sundaresan, Priya and Thananjeyan, Brijen and Balakrishna, Ashwin and Seita, Daniel and Grannen, Jennifer and Hwang, Minho and Hoque, Ryan and Gonzalez, Joseph E. and Jamali, Nawid and Yamane, Katsu and Iba, Soshi and Goldberg, Ken},
 bdsk-url-1 = {https://arxiv.org/abs/2003.12698},
 bdsk-url-2 = {https://doi.org/10.1109/ICRA48506.2021.9561980},
 booktitle = {2021 IEEE International Conference on Robotics and Automation (ICRA)},
 code = {https://tinyurl.com/fabric-descriptors},
 doi = {10.1109/ICRA48506.2021.9561980},
 issn = {2577-087X},
 keywords = {peerrev, selected},
 month = {5},
 pages = {11515-11522},
 title = {Learning Dense Visual Correspondences in Simulation to Smooth and Fold Real Fabrics},
 url = {https://arxiv.org/abs/2003.12698},
 year = {2021}
}

Ionel Gog, Sukrit Kalra, Peter Schafhalter, Matthew A. Wright, Joseph E. Gonzalez, and Ion Stoica. "Pylot: A Modular Platform for Exploring Latency-Accuracy Tradeoffs in Autonomous Vehicles." 2021 IEEE International Conference on Robotics and Automation (ICRA), 2021.

We present Pylot, a platform for autonomous vehicle (AV) research and development, built with the goal to allow researchers to study the effects of the latency and accuracy of their models and algorithms on the end-to-end driving behavior of an AV. This is achieved through a modular structure enabled by our high-performance dataflow system that represents AV software pipeline components (object detectors, motion planners, etc.) as a dataflow graph of operators which communicate on data streams using timestamped messages. Pylot readily interfaces with popular AV simulators like CARLA, and is easily deployable to real-world vehicles with minimal code changes. To reduce the burden of developing an entire pipeline for evaluating a single component, Pylot provides several state-of-the-art reference implementations for the various components of an AV pipeline. Using these reference implementations, a Pylot-based AV pipeline is able to drive a real vehicle, and attains a high score on the CARLA Autonomous Driving Challenge. We also present several case studies enabled by Pylot, including evidence of a need for context-dependent components, and per-component time allocation. Pylot is open source, with the code available at https://github.com/erdos-project/pylot.
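
Pylot's pipeline is described as a dataflow graph of operators exchanging timestamped messages. The snippet below is a stripped-down, generic illustration of that pattern; it is not the ERDOS/Pylot API, and all class, operator, and stream names are invented.

  from collections import namedtuple

  Message = namedtuple("Message", ["timestamp", "data"])

  class Operator:
      # A toy dataflow operator: applies a callback to each incoming timestamped
      # message and appends the result to its output stream (a plain list here).
      def __init__(self, name, callback):
          self.name, self.callback, self.output = name, callback, []

      def on_message(self, msg):
          out = Message(msg.timestamp, self.callback(msg.data))
          self.output.append(out)
          return out

  camera_stream = [Message(t, f"frame-{t}") for t in range(3)]       # source stream
  detector = Operator("detector", lambda frame: f"objects({frame})")
  planner = Operator("planner", lambda objects: f"plan({objects})")
  for msg in camera_stream:
      planner.on_message(detector.on_message(msg))
  print([m.data for m in planner.output])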

@inproceedings{Gog21,
 abstract = {We present Pylot, a platform for autonomous vehicle (AV) research and development, built with the goal to allow researchers to study the effects of the latency and accuracy of their models and algorithms on the end-to-end driving behavior of an AV. This is achieved through a modular structure enabled by our high-performance dataflow system that represents AV software pipeline components (object detectors, motion planners, etc.) as a dataflow graph of operators which communicate on data streams using timestamped messages. Pylot readily interfaces with popular AV simulators like CARLA, and is easily deployable to real-world vehicles with minimal code changes.To reduce the burden of developing an entire pipeline for evaluating a single component, Pylot provides several state-of-the-art reference implementations for the various components of an AV pipeline. Using these reference implementations, a Pylot-based AV pipeline is able to drive a real vehicle, and attains a high score on the CARLA Autonomous Driving Challenge. We also present several case studies enabled by Pylot, including evidence of a need for context-dependent components, and per-component time allocation. Pylot is open source, with the code available at https://github.com/erdos-project/pylot.},
 author = {Gog, Ionel and Kalra, Sukrit and Schafhalter, Peter and Wright, Matthew A. and Gonzalez, Joseph E. and Stoica, Ion},
 bdsk-url-1 = {https://arxiv.org/abs/2104.07830},
 bdsk-url-2 = {https://doi.org/10.1109/ICRA48506.2021.9561747},
 booktitle = {2021 IEEE International Conference on Robotics and Automation (ICRA)},
 code = {https://github.com/erdos-project/pylot},
 doi = {10.1109/ICRA48506.2021.9561747},
 issn = {2577-087X},
 keywords = {peerrev, selected},
 month = {5},
 pages = {8806-8813},
 title = {Pylot: A Modular Platform for Exploring Latency-Accuracy Tradeoffs in Autonomous Vehicles},
 url = {https://arxiv.org/abs/2104.07830},
 year = {2021}
}

Raghav Anand, Jeffrey Ichnowski, Chenggang Wu, Joseph M. Hellerstein, Joseph E. Gonzalez, and Ken Goldberg. "Serverless Multi-Query Motion Planning for Fog Robotics." 2021 IEEE International Conference on Robotics and Automation (ICRA), 2021.

Robots in semi-structured environments such as homes and warehouses sporadically require computation of high-dimensional motion plans. Cloud and fog-based parallelization of motion planning can speed up planning. This can be made more efficient by using "serverless" on-demand computing as opposed to always-on high-end computers. This paper explores parallelizing the computation of a sampling-based multi-query motion planner based on asymptotically-optimal Probabilistic Road Maps (PRM*) using the simultaneous execution of hundreds of cloud-based serverless functions. We propose an algorithm to overcome the communication and bandwidth limitations of serverless computing and use different work-sharing techniques to further optimize the cost and run time. Additionally, we provide proofs of probabilistic completeness and asymptotic optimality. In experiments on synthetic benchmarks and on a physical Fetch robot performing a sequence of decluttering motions, we observe up to a 50x speedup relative to a 4-core edge computer with only a marginally higher cost.
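
The fan-out/merge pattern behind the serverless planner can be sketched compactly. The toy below simulates "serverless invocations" with a local thread pool, omits edge collision checks and the paper's work-sharing optimizations, and uses invented names throughout.

```python
# Toy sketch of fanning roadmap sampling out to many short-lived workers and
# merging the results into one PRM-style roadmap. A thread pool stands in for
# serverless functions; this is not the paper's actual algorithm.
import math
import random
from concurrent.futures import ThreadPoolExecutor

OBSTACLES = [((0.5, 0.5), 0.2)]  # (center, radius) circles in the unit square


def collision_free(p):
    return all(math.dist(p, center) > radius for center, radius in OBSTACLES)


def sample_batch(seed, n=50):
    """One 'invocation': return n collision-free 2D configurations."""
    rng = random.Random(seed)
    points = []
    while len(points) < n:
        p = (rng.random(), rng.random())
        if collision_free(p):
            points.append(p)
    return points


def connect(points, radius=0.15):
    """Coordinator step: connect pairs of nearby samples into roadmap edges."""
    return [
        (i, j)
        for i in range(len(points))
        for j in range(i + 1, len(points))
        if math.dist(points[i], points[j]) < radius
    ]


if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=8) as pool:  # hundreds, in the real setting
        batches = list(pool.map(sample_batch, range(8)))
    nodes = [p for batch in batches for p in batch]
    edges = connect(nodes)
    print(f"merged roadmap: {len(nodes)} nodes, {len(edges)} edges")
```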

@inproceedings{Anand21,
 abstract = {Robots in semi-structured environments such as homes and warehouses sporadically require computation of high-dimensional motion plans. Cloud and fog-based parallelization of motion planning can speed up planning. This can be further made efficient by the use of "serverless" on-demand computing as opposed to always-on high end computers. This paper explores parallelizing the computation of a sampling-based multi-query motion planner based on asymptotically-optimal Probabilistic Road Maps (PRM*) using the simultaneous execution of 100s of cloud-based serverless functions. We propose an algorithm to overcome the communication and bandwidth limitations of serverless computing and use different work-sharing techniques to further optimize the cost and run time. Additionally, we provide proofs of probabilistic completeness and asymptotic optimality. In experiments on synthetic benchmarks and on a physical Fetch robot performing a sequence of decluttering motions, we observe up to a 50x speedup relative to a 4 core edge computer with only a marginally higher cost.},
 author = {Anand, Raghav and Ichnowski, Jeffrey and Wu, Chenggang and Hellerstein, Joseph M. and Gonzalez, Joseph E. and Goldberg, Ken},
 bdsk-url-1 = {https://goldberg.berkeley.edu/pubs/ICRA21-ichnowski-serverless-motion-planning-submitted.pdf},
 bdsk-url-2 = {https://doi.org/10.1109/ICRA48506.2021.9561571},
 booktitle = {2021 IEEE International Conference on Robotics and Automation (ICRA)},
 doi = {10.1109/ICRA48506.2021.9561571},
 issn = {2577-087X},
 keywords = {peerrev, selected},
 month = {5},
 pages = {7457-7463},
 title = {Serverless Multi-Query Motion Planning for Fog Robotics},
 url = {https://goldberg.berkeley.edu/pubs/ICRA21-ichnowski-serverless-motion-planning-submitted.pdf},
 year = {2021}
}

Anand Padmanabha Iyer, Qifan Pu, Kishan Patel, Joseph E. Gonzalez, and Ion Stoica. "TEGRA: Efficient Ad-Hoc Analytics on Evolving Graphs." 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21), 2021.

Several emerging evolving graph application workloads demand support for efficient ad-hoc analytics---the ability to perform ad-hoc queries on arbitrary time windows of the graph. We present TEGRA, a system that enables efficient ad-hoc window operations on evolving graphs. TEGRA allows efficient access to the state of the graph at arbitrary windows, and significantly accelerates ad-hoc window queries by using a compact in-memory representation for both graph and intermediate computation state. For this, it leverages persistent data structures to build a versioned, distributed graph state store, and couples it with an incremental computation model which can leverage these compact states. For users, it exposes these compact states using Timelapse, a natural abstraction. We evaluate TEGRA against existing evolving graph analysis techniques, and show that it significantly outperforms state-of-the-art systems (by up to 30×) for ad-hoc window operation workloads.
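
The versioned-state idea can be illustrated with a toy snapshot store. The sketch below uses Python's ChainMap as a stand-in for the persistent, structurally shared data structures TEGRA builds on; the class and method names are invented for illustration.

```python
# Toy sketch of a versioned graph store in the spirit of TEGRA's compact,
# shared snapshots. ChainMap layers stand in for persistent data structures.
from collections import ChainMap


class VersionedGraph:
    def __init__(self):
        self._versions = [ChainMap({})]  # version 0: empty adjacency map

    def add_edge(self, u, v):
        """Create a new version adding edge u -> v, sharing all unchanged state."""
        current = self._versions[-1]
        delta = {u: current.get(u, frozenset()) | {v}}
        self._versions.append(current.new_child(delta))
        return len(self._versions) - 1  # id of the new version

    def neighbors(self, version, u):
        """Query the graph as of an arbitrary earlier version (time window)."""
        return self._versions[version].get(u, frozenset())


g = VersionedGraph()
v1 = g.add_edge("a", "b")
v2 = g.add_edge("a", "c")
print(g.neighbors(v1, "a"))  # only 'b' existed at version 1
print(g.neighbors(v2, "a"))  # 'b' and 'c' at version 2
```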

@inproceedings{Iyer21,
 abstract = {Several emerging evolving graph application workloads demand support for efficient ad-hoc analytics---the ability to perform ad-hoc queries on arbitrary time windows of the graph. We present TEGRA, a system that enables efficient ad-hoc window operations on evolving graphs. TEGRA allows efficient access to the state of the graph at arbitrary windows, and significantly accelerates ad-hoc window queries by using a compact in-memory representation for both graph and intermediate computation state. For this, it leverages persistent data structures to build a versioned, distributed graph state store, and couples it with an incremental computation model which can leverage these compact states. For users, it exposes these compact states using Timelapse, a natural abstraction. We evaluate TEGRA against existing evolving graph analysis techniques, and show that it significantly outperforms state-of-the-art systems (by up to 30×) for ad-hoc window operation workloads.},
 author = {Anand Padmanabha Iyer and Qifan Pu and Kishan Patel and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {https://www.usenix.org/conference/nsdi21/presentation/iyer},
 booktitle = {18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)},
 isbn = {978-1-939133-21-2},
 keywords = {peerrev, selected},
 month = {4},
 pages = {337--355},
 publisher = {USENIX Association},
 title = { {TEGRA}: Efficient {Ad-Hoc} Analytics on Evolving Graphs},
 url = {https://www.usenix.org/conference/nsdi21/presentation/iyer},
 year = {2021}
}

Johann Schleier-Smith, Vikram Sreekanti, Anurag Khandelwal, Joao Carreira, Neeraja J. Yadwadkar, Raluca Ada Popa, Joseph E. Gonzalez, Ion Stoica, and David A. Patterson. "What Serverless Computing Is and Should Become: The Next Phase of Cloud Computing." Commun. ACM, 2021.

The evolution that serverless computing represents, the economic forces that shape it, why it could fail, and how it might fulfill its potential.

@article{SchleierSmith21,
 abstract = {The evolution that serverless computing represents, the economic forces that shape it, why it could fail, and how it might fulfill its potential.},
 address = {New York, NY, USA},
 author = {Schleier-Smith, Johann and Sreekanti, Vikram and Khandelwal, Anurag and Carreira, Joao and Yadwadkar, Neeraja J. and Popa, Raluca Ada and Gonzalez, Joseph E. and Stoica, Ion and Patterson, David A.},
 bdsk-url-1 = {https://doi.org/10.1145/3406011},
 doi = {10.1145/3406011},
 issn = {0001-0782},
 issue_date = {May 2021},
 journal = {Commun. ACM},
 keywords = {peerrev, selected},
 month = {4},
 number = {5},
 numpages = {9},
 pages = {76--84},
 publisher = {Association for Computing Machinery},
 title = {What Serverless Computing Is and Should Become: The Next Phase of Cloud Computing},
 url = {https://doi.org/10.1145/3406011},
 volume = {64},
 year = {2021}
}

Zhengming Zhang, Yaoqing Yang, Zhewei Yao, Yujun Yan, Joseph E. Gonzalez, Kannan Ramchandran, and Michael W. Mahoney. "Improving Semi-supervised Federated Learning by Reducing the Gradient Diversity of Models." 2021 IEEE International Conference on Big Data (Big Data), 2021.

Federated learning (FL) is a promising way to use the computing power of mobile devices while maintaining the privacy of users. Current work in FL, however, makes the unrealistic assumption that the users have ground-truth labels on their devices, while also assuming that the server has neither data nor labels. In this work, we consider the more realistic scenario where the users have only unlabeled data, while the server has some labeled data, and where the amount of labeled data is smaller than the amount of unlabeled data. We call this learning problem semi-supervised federated learning (SSFL). For SSFL, we demonstrate that a critical issue that affects the test accuracy is the large gradient diversity of the models from different users. Based on this, we investigate several design choices. First, we find that the so-called consistency regularization loss (CRL), which is widely used in semi-supervised learning, performs reasonably well but has large gradient diversity. Second, we find that Batch Normalization (BN) increases gradient diversity. Replacing BN with the recently-proposed Group Normalization (GN) can reduce gradient diversity and improve test accuracy. Third, we show that CRL combined with GN still has a large gradient diversity when the number of users is large. Based on these results, we propose a novel grouping-based model averaging method to replace the FedAvg averaging method. Overall, our grouping-based averaging, combined with GN and CRL, achieves better test accuracy than not just a contemporary paper on SSFL in the same settings (>10\%), but also four supervised FL algorithms.

@inproceedings{Zhengming22,
 abstract = {Federated learning (FL) is a promising way to use the computing power of mobile devices while maintaining the privacy of users. Current work in FL, however, makes the unrealistic assumption that the users have ground-truth labels on their devices, while also assuming that the server has neither data nor labels. In this work, we consider the more realistic scenario where the users have only unlabeled data, while the server has some labeled data, and where the amount of labeled data is smaller than the amount of unlabeled data. We call this learning problem semi-supervised federated learning (SSFL). For SSFL, we demonstrate that a critical issue that affects the test accuracy is the large gradient diversity of the models from different users. Based on this, we investigate several design choices. First, we find that the so-called consistency regularization loss (CRL), which is widely used in semi-supervised learning, performs reasonably well but has large gradient diversity. Second, we find that Batch Normalization (BN) increases gradient diversity. Replacing BN with the recently-proposed Group Normalization (GN) can reduce gradient diversity and improve test accuracy. Third, we show that CRL combined with GN still has a large gradient diversity when the number of users is large. Based on these results, we propose a novel grouping-based model averaging method to replace the FedAvg averaging method. Overall, our grouping-based averaging, combined with GN and CRL, achieves better test accuracy than not just a contemporary paper on SSFL in the same settings (>10\%), but also four supervised FL algorithms.},
 author = {Zhang, Zhengming and Yang, Yaoqing and Yao, Zhewei and Yan, Yujun and Gonzalez, Joseph E. and Ramchandran, Kannan and Mahoney, Michael W.},
 bdsk-url-1 = {https://doi.org/10.1109/BigData52589.2021.9671693},
 booktitle = {2021 IEEE International Conference on Big Data (Big Data)},
 doi = {10.1109/BigData52589.2021.9671693},
 keywords = {peerrev, selected},
 month = {12},
 pages = {1214-1225},
 title = {Improving Semi-supervised Federated Learning by Reducing the Gradient Diversity of Models},
 year = {2021}
}

Devin Petersohn, Dixin Tang, Rehan Durrani, Areg Melik-Adamyan, Joseph E. Gonzalez, Anthony D. Joseph, and Aditya G. Parameswaran. "Flexible Rule-Based Decomposition and Metadata Independence in Modin: A Parallel Dataframe System." Proc. VLDB Endow., 2021.

Dataframes have become universally popular as a means to represent data in various stages of structure, and manipulate it using a rich set of operators---thereby becoming an essential tool in the data scientists' toolbox. However, dataframe systems, such as pandas, scale poorly---and are non-interactive on moderate to large datasets. We discuss our experiences developing Modin, our first cut at a parallel dataframe system, which already has users across several industries and over 1M downloads. Modin translates pandas functions into a core set of operators that are individually parallelized via columnar, row-wise, or cell-wise decomposition rules that we formalize in this paper. We also introduce metadata independence to allow metadata---such as order and type---to be decoupled from the physical representation and maintained lazily. Using rule-based decomposition and metadata independence, along with careful engineering, Modin is able to support pandas operations across both rows and columns on very large dataframes---unlike Koalas and Dask DataFrames that either break down or are unable to support such operations, while also being much faster than pandas.
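
The decomposition rules can be illustrated with plain pandas: a cell-wise operator is applied independently to row partitions, and a column-wise reduction is applied per column partition, with results stitched back together. This is a toy sketch of the idea, not Modin's implementation.

```python
# Toy illustration of Modin-style decomposition rules using plain pandas:
# abs() is cell-wise safe, so it runs independently per row partition; sum()
# is column-wise, so it runs per column partition and the results are stitched.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(8, 4), columns=list("abcd"))

# Row-wise decomposition: split rows into partitions (imagine these on workers),
# apply the operator to each, then re-concatenate.
row_parts = [df.iloc[:4], df.iloc[4:]]
cellwise = pd.concat([part.abs() for part in row_parts])

# Column-wise decomposition: split columns, reduce each partition, then stitch.
col_parts = [df[["a", "b"]], df[["c", "d"]]]
colwise = pd.concat([part.sum() for part in col_parts])

assert cellwise.equals(df.abs())
assert colwise.equals(df.sum())
```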

@article{Modin21,
 abstract = {Dataframes have become universally popular as a means to represent data in various stages of structure, and manipulate it using a rich set of operators---thereby becoming an essential tool in the data scientists' toolbox. However, dataframe systems, such as pandas, scale poorly---and are non-interactive on moderate to large datasets. We discuss our experiences developing Modin, our first cut at a parallel dataframe system, which already has users across several industries and over 1M downloads. Modin translates pandas functions into a core set of operators that are individually parallelized via columnar, row-wise, or cell-wise decomposition rules that we formalize in this paper. We also introduce metadata independence to allow metadata---such as order and type---to be decoupled from the physical representation and maintained lazily. Using rule-based decomposition and metadata independence, along with careful engineering, Modin is able to support pandas operations across both rows and columns on very large dataframes---unlike Koalas and Dask DataFrames that either break down or are unable to support such operations, while also being much faster than pandas.},
 author = {Petersohn, Devin and Tang, Dixin and Durrani, Rehan and Melik-Adamyan, Areg and Gonzalez, Joseph E. and Joseph, Anthony D. and Parameswaran, Aditya G.},
 bdsk-url-1 = {https://doi.org/10.14778/3494124.3494152},
 doi = {10.14778/3494124.3494152},
 issn = {2150-8097},
 issue_date = {November 2021},
 journal = {Proc. VLDB Endow.},
 keywords = {peerrev, selected},
 month = {11},
 number = {3},
 numpages = {13},
 pages = {739--751},
 publisher = {VLDB Endowment},
 title = {Flexible Rule-Based Decomposition and Metadata Independence in Modin: A Parallel Dataframe System},
 url = {https://doi.org/10.14778/3494124.3494152},
 volume = {15},
 year = {2021}
}

J. Weston Hughes, Jeffrey E. Olgin, Robert Avram, Sean A. Abreau, Taylor Sittler, Kaahan Radia, Henry Hsia, Tomos Walters, Byron Lee, Joseph E. Gonzalez, and Geoffrey H. Tison. "Performance of a Convolutional Neural Network and Explainability Technique for 12-Lead Electrocardiogram Interpretation." JAMA Cardiology, 2021.

Millions of clinicians rely daily on automated preliminary electrocardiogram (ECG) interpretation. Critical comparisons of machine learning--based automated analysis against clinically accepted standards of care are lacking. To use readily available 12-lead ECG data to train and apply an explainability technique to a convolutional neural network (CNN) that achieves high performance against clinical standards of care. This cross-sectional study was conducted using data from January 1, 2003, to December 31, 2018. Data were obtained in a commonly available 12-lead ECG format from a single-center tertiary care institution. All patients aged 18 years or older who received ECGs at the University of California, San Francisco, were included, yielding a total of 365,009 patients. Data were analyzed from January 1, 2019, to March 2, 2021. A CNN was trained to predict the presence of 38 diagnostic classes in 5 categories from 12-lead ECG data. A CNN explainability technique called LIME (Linear Interpretable Model-Agnostic Explanations) was used to visualize ECG segments contributing to CNN diagnoses. Area under the receiver operating characteristic curve (AUC), sensitivity, and specificity were calculated for the CNN in the holdout test data set against cardiologist clinical diagnoses. For a second validation, 3 electrophysiologists provided consensus committee diagnoses against which the CNN, cardiologist clinical diagnosis, and MUSE (GE Healthcare) automated analysis performance was compared using the F1 score; AUC, sensitivity, and specificity were also calculated for the CNN against the consensus committee. A total of 992,748 ECGs from 365,009 adult patients (mean [SD] age, 56.2 [17.6] years; 183,600 women $[50.3\%]$; and 175,277 White patients $[48.0\%]$) were included in the analysis. In 91,440 test data set ECGs, the CNN demonstrated an AUC of at least 0.960 for 32 of 38 classes ($84.2\%$). Against the consensus committee diagnoses, the CNN had higher frequency-weighted mean F1 scores than both cardiologists and MUSE in all 5 categories (CNN frequency-weighted F1 score for rhythm, 0.812; conduction, 0.729; chamber diagnosis, 0.598; infarct, 0.674; and other diagnosis, 0.875). For 32 of 38 classes ($84.2\%$), the CNN had AUCs of at least 0.910 and demonstrated comparable F1 scores and higher sensitivity than cardiologists, except for atrial fibrillation (CNN F1 score, 0.847 vs cardiologist F1 score, 0.881), junctional rhythm (0.526 vs 0.727), premature ventricular complex (0.786 vs 0.800), and Wolff-Parkinson-White (0.800 vs 0.842). Compared with MUSE, the CNN had higher F1 scores for all classes except supraventricular tachycardia (CNN F1 score, 0.696 vs MUSE F1 score, 0.714). The LIME technique highlighted physiologically relevant ECG segments. The results of this cross-sectional study suggest that readily available ECG data can be used to train a CNN algorithm to achieve comparable performance to clinical cardiologists and exceed the performance of MUSE automated analysis for most diagnoses, with some exceptions. The LIME explainability technique applied to CNNs highlights physiologically relevant ECG segments that contribute to the CNN's diagnoses.

@article{Hughes21,
 abstract = { {Millions of clinicians rely daily on automated preliminary electrocardiogram (ECG) interpretation. Critical comparisons of machine learning--based automated analysis against clinically accepted standards of care are lacking.To use readily available 12-lead ECG data to train and apply an explainability technique to a convolutional neural network (CNN) that achieves high performance against clinical standards of care.This cross-sectional study was conducted using data from January 1, 2003, to December 31, 2018. Data were obtained in a commonly available 12-lead ECG format from a single-center tertiary care institution. All patients aged 18 years or older who received ECGs at the University of California, San Francisco, were included, yielding a total of 365009 patients. Data were analyzed from January 1, 2019, to March 2, 2021.A CNN was trained to predict the presence of 38 diagnostic classes in 5 categories from 12-lead ECG data. A CNN explainability technique called LIME (Linear Interpretable Model-Agnostic Explanations) was used to visualize ECG segments contributing to CNN diagnoses.Area under the receiver operating characteristic curve (AUC), sensitivity, and specificity were calculated for the CNN in the holdout test data set against cardiologist clinical diagnoses. For a second validation, 3 electrophysiologists provided consensus committee diagnoses against which the CNN, cardiologist clinical diagnosis, and MUSE (GE Healthcare) automated analysis performance was compared using the F1 score; AUC, sensitivity, and specificity were also calculated for the CNN against the consensus committee.A total of 992748 ECGs from 365009 adult patients (mean [SD] age, 56.2 [17.6] years; 183600 women $[50.3\%]$; and 175277 White patients $[48.0\%]$) were included in the analysis. In 91440 test data set ECGs, the CNN demonstrated an AUC of at least 0.960 for 32 of 38 classes ($84.2\%$). Against the consensus committee diagnoses, the CNN had higher frequency-weighted mean F1 scores than both cardiologists and MUSE in all 5 categories (CNN frequency-weighted F1 score for rhythm, 0.812; conduction, 0.729; chamber diagnosis, 0.598; infarct, 0.674; and other diagnosis, 0.875). For 32 of 38 classes ($84.2\%$), the CNN had AUCs of at least 0.910 and demonstrated comparable F1 scores and higher sensitivity than cardiologists, except for atrial fibrillation (CNN F1 score, 0.847 vs cardiologist F1 score, 0.881), junctional rhythm (0.526 vs 0.727), premature ventricular complex (0.786 vs 0.800), and Wolff-Parkinson-White (0.800 vs 0.842). Compared with MUSE, the CNN had higher F1 scores for all classes except supraventricular tachycardia (CNN F1 score, 0.696 vs MUSE F1 score, 0.714). The LIME technique highlighted physiologically relevant ECG segments.The results of this cross-sectional study suggest that readily available ECG data can be used to train a CNN algorithm to achieve comparable performance to clinical cardiologists and exceed the performance of MUSE automated analysis for most diagnoses, with some exceptions. The LIME explainability technique applied to CNNs highlights physiologically relevant ECG segments that contribute to the CNN's diagnoses.} },
 author = {Hughes, J. Weston and Olgin, Jeffrey E. and Avram, Robert and Abreau, Sean A. and Sittler, Taylor and Radia, Kaahan and Hsia, Henry and Walters, Tomos and Lee, Byron and Gonzalez, Joseph E. and Tison, Geoffrey H.},
 bdsk-url-1 = {https://doi.org/10.1001/jamacardio.2021.2746},
 doi = {10.1001/jamacardio.2021.2746},
 eprint = {https://jamanetwork.com/journals/jamacardiology/articlepdf/2782549/jamacardiology\_hughes\_2021\_oi\_210051\_1635348688.2775.pdf},
 issn = {2380-6583},
 journal = {JAMA Cardiology},
 keywords = {peerrev, selected},
 month = {11},
 number = {11},
 pages = {1285-1295},
 title = { {Performance of a Convolutional Neural Network and Explainability Technique for 12-Lead Electrocardiogram Interpretation} },
 url = {https://doi.org/10.1001/jamacardio.2021.2746},
 volume = {6},
 year = {2021}
}

Xin Wang, Thomas E. Huang, Benlin Liu, Fisher Yu, Xiaolong Wang, Joseph E. Gonzalez, and Trevor Darrell. "Robust Object Detection via Instance-Level Temporal Cycle Confusion." Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), 2021.

Building reliable object detectors that are robust to domain shifts, such as various changes in context, viewpoint, and object appearances, is critical for real-world applications. In this work, we study the effectiveness of auxiliary self-supervised tasks to improve the out-of-distribution generalization of object detectors. Inspired by the principle of maximum entropy, we introduce a novel self-supervised task, instance-level temporal cycle confusion (CycConf), which operates on the region features of the object detectors. For each object, the task is to find the most different object proposals in the adjacent frame in a video and then cycle back to itself for self-supervision. CycConf encourages the object detector to explore invariant structures across instances under various motions, which leads to improved model robustness in unseen domains at test time. We observe consistent out-of-domain performance improvements when training object detectors in tandem with self-supervised tasks on various domain adaptation benchmarks with static images (Cityscapes, Foggy Cityscapes, Sim10K) and large-scale video datasets (BDD100K and Waymo open data). The code and models are released at https://xinw.ai/cyc-conf.

@inproceedings{Wang21,
 abstract = {Building reliable object detectors that are robust to domain shifts, such as various changes in context, viewpoint, and object appearances, is critical for real-world applications. In this work, we study the effectiveness of auxiliary self-supervised tasks to improve the out-of-distribution generalization of object detectors. Inspired by the principle of maximum entropy, we introduce a novel self-supervised task, instance-level temporal cycle confusion (CycConf), which operates on the region features of the object detectors. For each object, the task is to find the most different object proposals in the adjacent frame in a video and then cycle back to itself for self-supervision. CycConf encourages the object detector to explore invariant structures across instances under various motions, which leads to improved model robustness in unseen domains at test time. We observe consistent out-of-domain performance improvements when training object detectors in tandem with self-supervised tasks on various domain adaptation benchmarks with static images (Cityscapes, Foggy Cityscapes, Sim10K) and large-scale video datasets (BDD100K and Waymo open data). The code and models are released at https://xinw.ai/cyc-conf.},
 author = {Wang, Xin and Huang, Thomas E. and Liu, Benlin and Yu, Fisher and Wang, Xiaolong and Gonzalez, Joseph E. and Darrell, Trevor},
 booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
 code = {https://xinw.ai/cyc-conf},
 keywords = {peerrev, selected},
 month = {10},
 pages = {9143-9152},
 title = {Robust Object Detection via Instance-Level Temporal Cycle Confusion},
 year = {2021}
}

Bichen Wu, Chenfeng Xu, Xiaoliang Dai, Alvin Wan, Peizhao Zhang, Zhicheng Yan, Masayoshi Tomizuka, Joseph E. Gonzalez, Kurt Keutzer, and Peter Vajda. "Visual Transformers: Where Do Transformers Really Belong in Vision Models?." Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), 2021.

A recent trend in computer vision is to replace convolutions with transformers. However, the performance gain of transformers is attained at a steep cost, requiring GPU years and hundreds of millions of samples for training. This excessive resource usage compensates for a misuse of transformers: Transformers densely model relationships between their inputs -- ideal for late stages of a neural network, when concepts are sparse and spatially-distant, but extremely inefficient for early stages of a network, when patterns are redundant and localized. To address these issues, we leverage the respective strengths of both operations, building convolution-transformer hybrids. Critically, in sharp contrast to pixel-space transformers, our Visual Transformer (VT) operates in a semantic token space, judiciously attending to different image parts based on context. Our VTs significantly outperform baselines: On ImageNet, our VT-ResNets outperform convolution-only ResNet by 4.6 to 7 points and transformer-only ViT-B by 2.6 points with 2.5 times fewer FLOPs and 2.1 times fewer parameters. For semantic segmentation on LIP and COCO-stuff, VT-based feature pyramid networks (FPN) achieve 0.35 points higher mIoU while reducing the FPN module's FLOPs by 6.5x.

@inproceedings{Wu21,
 abstract = {A recent trend in computer vision is to replace convolutions with transformers. However, the performance gain of transformers is attained at a steep cost, requiring GPU years and hundreds of millions of samples for training. This excessive resource usage compensates for a misuse of transformers: Transformers densely model relationships between its inputs -- ideal for late stages of a neural network, when concepts are sparse and spatially-distant, but extremely inefficient for early stages of a network, when patterns are redundant and localized. To address these issues, we leverage the respective strengths of both operations, building convolution-transformer hybrids. Critically, in sharp contrast to pixel-space transformers, our Visual Transformer (VT) operates in a semantic token space, judiciously attending to different image parts based on context. Our VTs significantly outperforms baselines: On ImageNet, our VT-ResNets outperform convolution-only ResNet by 4.6 to 7 points and transformer-only ViT-B by 2.6 points with 2.5 times fewer FLOPs, 2.1 times fewer parameters. For semantic segmentation on LIP and COCO-stuff, VT-based feature pyramid networks (FPN) achieve 0.35 points higher mIoU while reducing the FPN module's FLOPs by 6.5x.},
 author = {Wu, Bichen and Xu, Chenfeng and Dai, Xiaoliang and Wan, Alvin and Zhang, Peizhao and Yan, Zhicheng and Tomizuka, Masayoshi and Gonzalez, Joseph E. and Keutzer, Kurt and Vajda, Peter},
 booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
 keywords = {peerrev, selected},
 month = {10},
 pages = {599-609},
 title = {Visual Transformers: Where Do Transformers Really Belong in Vision Models?},
 year = {2021}
}

Jeffrey Ichnowski, Paras Jain, Bartolomeo Stellato, Goran Banjac, Michael Luo, Francesco Borrelli, Joseph E. Gonzalez, Ion Stoica, and Ken Goldberg. "Accelerating Quadratic Optimization with Reinforcement Learning." Advances in Neural Information Processing Systems, 2021.

First-order methods for quadratic optimization such as OSQP are widely used for large-scale machine learning and embedded optimal control, where many related problems must be rapidly solved. These methods face two persistent challenges: manual hyperparameter tuning and convergence time to high-accuracy solutions. To address these, we explore how Reinforcement Learning (RL) can learn a policy to tune parameters to accelerate convergence. In experiments with well-known QP benchmarks we find that our RL policy, RLQP, significantly outperforms state-of-the-art QP solvers by up to 3x. RLQP generalizes surprisingly well to previously unseen problems with varying dimension and structure from different applications, including the QPLIB, Netlib LP and Maros-M{\'e}sz{\'a}ros problems. Code, models, and videos are available at \url{https://berkeleyautomation.github.io/rlqp/}.

@inproceedings{NEURIPS2021_afdec700,
 abstract = {First-order methods for quadratic optimization such as OSQP are widely used for large-scale machine learning and embedded optimal control, where many related problems must be rapidly solved. These methods face two persistent challenges: manual hyperparameter tuning and convergence time to high-accuracy solutions. To address these, we explore how Reinforcement Learning (RL) can learn a policy to tune parameters to accelerate convergence. In experiments with well-known QP benchmarks we find that our RL policy, RLQP, significantly outperforms state-of-the-art QP solvers by up to 3x. RLQP generalizes surprisingly well to previously unseen problems with varying dimension and structure from different applications, including the QPLIB, Netlib LP and Maros-M{\'e}sz{\'a}ros problems. Code, models, and videos are available at \url{https://berkeleyautomation.github.io/rlqp/}.},
 author = {Ichnowski, Jeffrey and Jain, Paras and Stellato, Bartolomeo and Banjac, Goran and Luo, Michael and Borrelli, Francesco and Gonzalez, Joseph E. and Stoica, Ion and Goldberg, Ken},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2021/file/afdec7005cc9f14302cd0474fd0f3c96-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 code = {https://berkeleyautomation.github.io/rlqp/},
 editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
 keywords = {peerrev, selected},
 pages = {21043--21055},
 publisher = {Curran Associates, Inc.},
 title = {Accelerating Quadratic Optimization with Reinforcement Learning},
 url = {https://proceedings.neurips.cc/paper/2021/file/afdec7005cc9f14302cd0474fd0f3c96-Paper.pdf},
 volume = {34},
 year = {2021}
}

Tianjun Zhang, Benjamin Eysenbach, Ruslan Salakhutdinov, Sergey Levine, and Joseph E. Gonzalez. "C-Planning: An Automatic Curriculum for Learning Goal-Reaching Tasks." arXiv, 2021.

Goal-conditioned reinforcement learning (RL) can solve tasks in a wide range of domains, including navigation and manipulation, but learning to reach distant goals remains a central challenge to the field. Learning to reach such goals is particularly hard without any offline data, expert demonstrations, or reward shaping. In this paper, we propose an algorithm to solve the distant goal-reaching task by using search at training time to automatically generate a curriculum of intermediate states. Our algorithm, Classifier-Planning (C-Planning), frames the learning of the goal-conditioned policies as expectation maximization: the E-step corresponds to planning an optimal sequence of waypoints using graph search, while the M-step aims to learn a goal-conditioned policy to reach those waypoints. Unlike prior methods that combine goal-conditioned RL with graph search, ours performs search only during training and not testing, significantly decreasing the compute costs of deploying the learned policy. Empirically, we demonstrate that our method is more sample efficient than prior methods. Moreover, it is able to solve very long-horizon manipulation and navigation tasks, tasks that prior goal-conditioned methods and methods based on graph search fail to solve.

@misc{cplanning21,
 abstract = {Goal-conditioned reinforcement learning (RL) can solve tasks in a wide range of domains, including navigation and manipulation, but learning to reach distant goals remains a central challenge to the field. Learning to reach such goals is particularly hard without any offline data, expert demonstrations, and reward shaping. In this paper, we propose an algorithm to solve the distant goal-reaching task by using search at training time to automatically generate a curriculum of intermediate states. Our algorithm, Classifier-Planning (C-Planning), frames the learning of the goal-conditioned policies as expectation maximization: the E-step corresponds to planning an optimal sequence of waypoints using graph search, while the M-step aims to learn a goal-conditioned policy to reach those waypoints. Unlike prior methods that combine goal-conditioned RL with graph search, ours performs search only during training and not testing, significantly decreasing the compute costs of deploying the learned policy. Empirically, we demonstrate that our method is more sample efficient than prior methods. Moreover, it is able to solve very long horizons manipulation and navigation tasks, tasks that prior goal-conditioned methods and methods based on graph search fail to solve.},
 author = {Zhang, Tianjun and Eysenbach, Benjamin and Salakhutdinov, Ruslan and Levine, Sergey and Gonzalez, Joseph E.},
 bdsk-url-1 = {https://arxiv.org/abs/2110.12080},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2110.12080},
 copyright = {Creative Commons Attribution 4.0 International},
 doi = {10.48550/ARXIV.2110.12080},
 keywords = {arxivpre},
 publisher = {arXiv},
 title = {C-Planning: An Automatic Curriculum for Learning Goal-Reaching Tasks},
 url = {https://arxiv.org/abs/2110.12080},
 year = {2021}
}

Daniel Rothchild, Alex Tamkin, Julie Yu, Ujval Misra, and Joseph Gonzalez. "C5T5: Controllable Generation of Organic Molecules with Transformers." arXiv, 2021.

Methods for designing organic materials with desired properties have high potential impact across fields such as medicine, renewable energy, petrochemical engineering, and agriculture. However, using generative modeling to design substances with desired properties is difficult because candidate compounds must satisfy multiple constraints, including synthetic accessibility and other metrics that are intuitive to domain experts but challenging to quantify. We propose C5T5, a novel self-supervised pretraining method that enables transformers to make zero-shot select-and-replace edits, altering organic substances towards desired property values. C5T5 operates on IUPAC names -- a standardized molecular representation that intuitively encodes rich structural information for organic chemists but that has been largely ignored by the ML community. Our technique requires no edited molecule pairs to train and only a rough estimate of molecular properties, and it has the potential to model long-range dependencies and symmetric molecular structures more easily than graph-based methods. C5T5 also provides a powerful interface to domain experts: it grants users fine-grained control over the generative process by selecting and replacing IUPAC name fragments, which enables experts to leverage their intuitions about structure-activity relationships. We demonstrate C5T5's effectiveness on four physical properties relevant for drug discovery, showing that it learns successful and chemically intuitive strategies for altering molecules towards desired property values.

@misc{c5t521,
 abstract = {Methods for designing organic materials with desired properties have high potential impact across fields such as medicine, renewable energy, petrochemical engineering, and agriculture. However, using generative modeling to design substances with desired properties is difficult because candidate compounds must satisfy multiple constraints, including synthetic accessibility and other metrics that are intuitive to domain experts but challenging to quantify. We propose C5T5, a novel self-supervised pretraining method that enables transformers to make zero-shot select-and-replace edits, altering organic substances towards desired property values. C5T5 operates on IUPAC names -- a standardized molecular representation that intuitively encodes rich structural information for organic chemists but that has been largely ignored by the ML community. Our technique requires no edited molecule pairs to train and only a rough estimate of molecular properties, and it has the potential to model long-range dependencies and symmetric molecular structures more easily than graph-based methods. C5T5 also provides a powerful interface to domain experts: it grants users fine-grained control over the generative process by selecting and replacing IUPAC name fragments, which enables experts to leverage their intuitions about structure-activity relationships. We demonstrate C5T5's effectiveness on four physical properties relevant for drug discovery, showing that it learns successful and chemically intuitive strategies for altering molecules towards desired property values.},
 archiveprefix = {arXiv},
 author = {Rothchild, Daniel and Tamkin, Alex and Yu, Julie and Misra, Ujval and Gonzalez, Joseph},
 bdsk-url-1 = {https://arxiv.org/abs/2108.10307},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2108.10307},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2108.10307},
 keywords = {arxivpre, Machine Learning (cs.LG), FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {C5T5: Controllable Generation of Organic Molecules with Transformers},
 url = {https://arxiv.org/abs/2108.10307},
 year = {2021}
}

Robert Avram, Jeffrey Olgin, Alvin Wan, Zeeshan Ahmed, Louis Verreault-Julien, Sean Abreau, Derek Wan, Joseph E. Gonzalez, Derek So, Krishan Soni, and Geoffrey Tison. "CathAI: Fully Automated Coronary Angiography Interpretation and Stenosis Detection Using a Deep Learning-Based Algorithmic Pipeline." Journal of the American College of Cardiology, 2021.

Coronary angiography is the gold standard for coronary heart disease (CHD) evaluation, but relies upon ad-hoc visual assessment which suffers from high variability and poor reproducibility. We developed a pipeline of deep neural network algorithms (CathAI) that accomplishes the tasks necessary for automated assessment of coronary stenosis severity from coronary angiograms. CathAI passed angiograms sequentially through Algorithms 1 to 4 to achieve automated angiogram interpretation, using UCSF data from April 2008 to December 2019. CathAI-predicted stenosis severity was compared against the clinical angiographic report for that artery segment. A total of 13,843 angiographic studies were obtained from 11,972 patients. Algorithms 1-2 had positive predictive values and sensitivities of $\geq 90\%$ to identify angiographic projection angle and left/right coronary artery angiograms. Algorithm 3 identified $73.1\%$ of the stenoses that were described in procedural reports. To discriminate clinically significant ($\geq 70\%$) stenosis, Algorithm 4 exhibited an area under the curve of 0.836 ($95\%$ CI: $0.817-0.856$) at the artery level; specificity was $80.0\%$ at a sensitivity of $71.0\%$. Fully automated coronary angiogram analysis is feasible. CathAI demonstrated proficiency in all tasks required to interpret real-world angiograms. Deployment of CathAI may serve to increase standardization and reproducibility in coronary stenosis assessment.

@article{Avram21,
 abstract = {
Coronary angiography is the gold standard for coronary heart disease (CHD) evaluation, but relies upon ad-hoc visual assessment which suffers from high variability and poor reproducibility. We developed a pipeline of deep neural network algorithms (CathAI) that accomplishes the tasks necessary for automated assessment of coronary stenosis severity from coronary angiograms.

CathAI used angiograms designed to flow sequentially from Algorithms 1 to 4, to achieve automated angiogram interpretation (Figure 1) using UCSF data from April 2008 to December 2019. CathAI-predicted stenosis severity was compared against the clinical angiographic report for that artery segment.

A total of 13,843 angiographic studies were obtained from $11972$ patients. Algorithms 1-2 had positive predictive values and sensitivities of $\geq 90\%$ to identify angiographic projection angle and left/right coronary artery angiograms. Algorithm 3 identified $73.1\%$ of the stenoses that were described in procedural reports. To discriminate clinically significant ($\geq 70\%$) stenosis, Algorithm 4 exhibited an area under the curve of 0.836 ($95\%$ CI: $0.817-0.856$) at the artery-level; specificity was $80.0\%$ at a sensitivity of $71.0\%$. Fully automated coronary angiogram analysis is feasible. CathAI demonstrated proficiency in all tasks required to interpret real-world angiograms. Deployment of CathAI may serve to increase standardization and reproducibility in coronary stenosis assessment.
},
 author = {Robert Avram and Jeffrey Olgin and Alvin Wan and Zeeshan Ahmed and Louis Verreault-Julien and Sean Abreau and Derek Wan and Joseph E. Gonzalez and Derek So and Krishan Soni and Geoffrey Tison},
 bdsk-url-2 = {https://doi.org/10.1016/S0735-1097(21)04598-8},
 doi = {10.1016/S0735-1097(21)04598-8},
 eprint = {https://www.jacc.org/doi/pdf/10.1016/S0735-1097\%2821\%2904598-8},
 journal = {Journal of the American College of Cardiology},
 keywords = {peerrev, selected},
 number = {18\_Supplement\_1},
 pages = {3244-3244},
 title = {CathAI: Fully Automated Coronary Angiography Interpretation and Stenosis Detection Using a Deep Learning-Based Algorithmic Pipeline},
 url = {https://www.jacc.org/doi/abs/10.1016/S0735-1097%2821%2904598-8},
 volume = {77},
 year = {2021}
}

Nicholas Rhinehart, Jeff He, Charles Packer, Matthew A. Wright, Rowan McAllister, Joseph E. Gonzalez, and Sergey Levine. "Contingencies from Observations: Tractable Contingency Planning with Learned Behavior Models." arXiv, 2021.

Humans have a remarkable ability to make decisions by accurately reasoning about future events, including the future behaviors and states of mind of other agents. Consider driving a car through a busy intersection: it is necessary to reason about the physics of the vehicle, the intentions of other drivers, and their beliefs about your own intentions. If you signal a turn, another driver might yield to you, or if you enter the passing lane, another driver might decelerate to give you room to merge in front. Competent drivers must plan how they can safely react to a variety of potential future behaviors of other agents before they make their next move. This requires contingency planning: explicitly planning a set of conditional actions that depend on the stochastic outcome of future events. In this work, we develop a general-purpose contingency planner that is learned end-to-end using high-dimensional scene observations and low-dimensional behavioral observations. We use a conditional autoregressive flow model to create a compact contingency planning space, and show how this model can tractably learn contingencies from behavioral observations. We developed a closed-loop control benchmark of realistic multi-agent scenarios in a driving simulator (CARLA), on which we compare our method to various noncontingent methods that reason about multi-agent future behavior, including several state-of-the-art deep learning-based planning approaches. We illustrate that these noncontingent planning methods fundamentally fail on this benchmark, and find that our deep contingency planning method achieves significantly superior performance. Code to run our benchmark and reproduce our results is available at \url{https://sites.google.com/view/contingency-planning}.

@misc{Rinehart21,
 abstract = {Humans have a remarkable ability to make decisions by accurately reasoning about future events, including the future behaviors and states of mind of other agents. Consider driving a car through a busy intersection: it is necessary to reason about the physics of the vehicle, the intentions of other drivers, and their beliefs about your own intentions. If you signal a turn, another driver might yield to you, or if you enter the passing lane, another driver might decelerate to give you room to merge in front. Competent drivers must plan how they can safely react to a variety of potential future behaviors of other agents before they make their next move. This requires contingency planning: explicitly planning a set of conditional actions that depend on the stochastic outcome of future events. In this work, we develop a general-purpose contingency planner that is learned end-to-end using high-dimensional scene observations and low-dimensional behavioral observations. We use a conditional autoregressive flow model to create a compact contingency planning space, and show how this model can tractably learn contingencies from behavioral observations. We developed a closed-loop control benchmark of realistic multi-agent scenarios in a driving simulator (CARLA), on which we compare our method to various noncontingent methods that reason about multi-agent future behavior, including several state-of-the-art deep learning-based planning approaches. We illustrate that these noncontingent planning methods fundamentally fail on this benchmark, and find that our deep contingency planning method achieves significantly superior performance. Code to run our benchmark and reproduce our results is available at \url{https://sites.google.com/view/contingency-planning}.},
 author = {Rhinehart, Nicholas and He, Jeff and Packer, Charles and Wright, Matthew A. and McAllister, Rowan and Gonzalez, Joseph E. and Levine, Sergey},
 bdsk-url-1 = {https://arxiv.org/abs/2104.10558},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2104.10558},
 code = {https://sites.google.com/view/contingency-planning},
 copyright = {Creative Commons Attribution 4.0 International},
 doi = {10.48550/ARXIV.2104.10558},
 keywords = {arxivpre, Robotics (cs.RO), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {Contingencies from Observations: Tractable Contingency Planning with Learned Behavior Models},
 url = {https://arxiv.org/abs/2104.10558},
 year = {2021}
}

Vainavi Viswanath, Jennifer Grannen, Priya Sundaresan, Brijen Thananjeyan, Ashwin Balakrishna, Ellen Novoseller, Jeffrey Ichnowski, Michael Laskey, Joseph E. Gonzalez, and Ken Goldberg. "Disentangling Dense Multi-Cable Knots." 2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2021.

Disentangling two or more cables requires many steps to remove crossings between and within cables. We formalize the problem of disentangling multiple cables and present an algorithm, Iterative Reduction Of Non-planar Multiple cAble kNots (IRON-MAN), that outputs robot actions to remove crossings from multi-cable knotted structures. We instantiate this algorithm with a learned perception system, inspired by prior work in single-cable untying, that, given an image input, can disentangle two-cable twists, three-cable braids, and knots of two or three cables, such as overhand, square, carrick bend, sheet bend, crown, and fisherman's knots. IRON-MAN keeps track of task-relevant keypoints corresponding to target cable endpoints and crossings and iteratively disentangles the cables by identifying and undoing crossings that are critical to knot structure. Using a da Vinci surgical robot, we experimentally evaluate the effectiveness of IRON-MAN on untangling multi-cable knots of types that appear in the training data, as well as generalizing to novel classes of multi-cable knots. Results suggest that IRON-MAN is effective in disentangling knots involving up to three cables with 80.5\% success and generalizing to knot types that are not present during training, with cables of either distinct or identical colors.

@inproceedings{Viswanath21,
 abstract = {Disentangling two or more cables requires many steps to remove crossings between and within cables. We formalize the problem of disentangling multiple cables and present an algorithm, Iterative Reduction Of Non-planar Multiple cAble kNots (IRON-MAN), that outputs robot actions to remove crossings from multi-cable knotted structures. We instantiate this algorithm with a learned perception system, inspired by prior work in single-cable untying that given an image input, can disentangle two-cable twists, three-cable braids, and knots of two or three cables, such as overhand, square, carrick bend, sheet bend, crown, and fisherman's knots. IRON-MAN keeps track of task-relevant keypoints corresponding to target cable endpoints and crossings and iteratively disentangles the cables by identifying and undoing crossings that are critical to knot structure. Using a da Vinci surgical robot, we experimentally evaluate the effectiveness of IRON-MAN on untangling multi-cable knots of types that appear in the training data, as well as generalizing to novel classes of multi-cable knots. Results suggest that IRON-MAN is effective in disentangling knots involving up to three cables with 80.5\% success and generalizing to knot types that are not present during training, with cables of both distinct or identical colors.},
 author = {Viswanath, Vainavi and Grannen, Jennifer and Sundaresan, Priya and Thananjeyan, Brijen and Balakrishna, Ashwin and Novoseller, Ellen and Ichnowski, Jeffrey and Laskey, Michael and Gonzalez, Joseph E. and Goldberg, Ken},
 bdsk-url-1 = {https://arxiv.org/abs/2106.02252},
 bdsk-url-2 = {https://doi.org/10.1109/IROS51168.2021.9636397},
 booktitle = {2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
 doi = {10.1109/IROS51168.2021.9636397},
 keywords = {peerrev, selected},
 pages = {3731-3738},
 title = {Disentangling Dense Multi-Cable Knots},
 url = {https://arxiv.org/abs/2106.02252},
 year = {2021}
}

Lisa Dunlap, Kirthevasan Kandasamy, Ujval Misra, Richard Liaw, Michael Jordan, Ion Stoica, and Joseph E. Gonzalez. "Elastic Hyperparameter Tuning on the Cloud." Proceedings of the ACM Symposium on Cloud Computing, 2021.

Hyperparameter tuning is a necessary step in training and deploying machine learning models. Most prior work on hyperparameter tuning has studied methods for maximizing model accuracy under a time constraint, assuming a fixed cluster size. While this is appropriate in data center environments, the increased deployment of machine learning workloads in cloud settings necessitates studying hyperparameter tuning with an elastic cluster size and time and monetary budgets. While recent work has leveraged the elasticity of the cloud to minimize the execution cost of a pre-determined hyperparameter tuning job originally designed for fixed-cluster sizes, it does not aim to maximize accuracy. In this work, we aim to maximize accuracy given time and cost constraints. We introduce SEER---Sequential Elimination with Elastic Resources, an algorithm that tests different hyperparameter values in the beginning and maintains varying degrees of parallelism among the promising configurations to ensure that they are trained sufficiently before the deadline. Unlike fixed cluster size methods, it is able to exploit the flexibility in resource allocation the elastic setting has to offer in order to avoid undesirable effects of sublinear scaling. Furthermore, SEER can be easily integrated into existing systems and makes minimal assumptions about the workload. On a suite of benchmarks, we demonstrate that SEER outperforms both existing methods for hyperparameter tuning on a fixed cluster as well as naive extensions of these algorithms to the cloud setting.
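
The allocation pattern is easy to sketch: start many configurations, periodically eliminate the worst, and hand the freed (elastic) workers to the survivors. The toy below uses simulated scores and invented function names; it illustrates the sequential-elimination idea rather than SEER itself.

```python
# Toy sketch of sequential elimination with an elastic worker pool: drop the
# worst half of configurations each round and reallocate freed workers to the
# survivors so they train longer before the deadline. Scores are simulated.
import random


def simulated_eval(quality, epochs):
    """Observed validation score rises with training time, plus noise."""
    return quality * (1 - 0.5 ** (epochs / 10)) + random.gauss(0, 0.01)


def elastic_tune(n_configs=16, total_workers=8, rounds=4, epochs_per_round=10):
    quality = {i: random.random() for i in range(n_configs)}  # hidden ground truth
    trained = {i: 0 for i in quality}
    survivors = list(quality)
    for r in range(rounds):
        # Elastic step: split the whole worker pool among current survivors,
        # so fewer survivors means more training per configuration.
        epochs_each = epochs_per_round * total_workers // max(len(survivors), 1)
        scores = {}
        for i in survivors:
            trained[i] += epochs_each
            scores[i] = simulated_eval(quality[i], trained[i])
        keep = max(len(survivors) // 2, 1)
        survivors = sorted(scores, key=scores.get, reverse=True)[:keep]
        print(f"round {r}: kept {len(survivors)} configs, best score {max(scores.values()):.3f}")
    return survivors[0]


if __name__ == "__main__":
    print("selected config:", elastic_tune())
```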

@inproceedings{Dunlap21,
 abstract = {Hyperparameter tuning is a necessary step in training and deploying machine learning models. Most prior work on hyperparameter tuning has studied methods for maximizing model accuracy under a time constraint, assuming a fixed cluster size. While this is appropriate in data center environments, the increased deployment of machine learning workloads in cloud settings necessitates studying hyperparameter tuning with an elastic cluster size and time and monetary budgets. While recent work has leveraged the elasticity of the cloud to minimize the execution cost of a pre-determined hyperparameter tuning job originally designed for fixed-cluster sizes, they do not aim to maximize accuracy.In this work, we aim to maximize accuracy given time and cost constraints. We introduce SEER---Sequential Elimination with Elastic Resources, an algorithm that tests different hyperparameter values in the beginning and maintains varying degrees of parallelism among the promising configurations to ensure that they are trained sufficiently before the deadline. Unlike fixed cluster size methods, it is able to exploit the flexibility in resource allocation the elastic setting has to offer in order to avoid undesirable effects of sublinear scaling. Furthermore, SEER can be easily integrated into existing systems and makes minimal assumptions about the workload. On a suite of benchmarks, we demonstrate that SEER outperforms both existing methods for hyperparameter tuning on a fixed cluster as well as naive extensions of these algorithms to the cloud setting.},
 address = {New York, NY, USA},
 author = {Dunlap, Lisa and Kandasamy, Kirthevasan and Misra, Ujval and Liaw, Richard and Jordan, Michael and Stoica, Ion and Gonzalez, Joseph E.},
 bdsk-url-1 = {https://doi.org/10.1145/3472883.3486989},
 booktitle = {Proceedings of the ACM Symposium on Cloud Computing},
 doi = {10.1145/3472883.3486989},
 isbn = {9781450386388},
 keywords = {peerrev, selected},
 location = {Seattle, WA, USA},
 numpages = {14},
 pages = {33--46},
 publisher = {Association for Computing Machinery},
 series = {SoCC '21},
 title = {Elastic Hyperparameter Tuning on the Cloud},
 url = {https://doi.org/10.1145/3472883.3486989},
 year = {2021}
}

Doris Xin, Devin Petersohn, Dixin Tang, Yifan Wu, Joseph E. Gonzalez, Joseph M. Hellerstein, Anthony D. Joseph, and Aditya G. Parameswaran. "Enhancing the Interactivity of Dataframe Queries by Leveraging Think Time." arXiv, 2021.

We propose opportunistic evaluation, a framework for accelerating interactions with dataframes. Interactive latency is critical for iterative, human-in-the-loop dataframe workloads for supporting exploratory data analysis. Opportunistic evaluation significantly reduces interactive latency by 1) prioritizing computation directly relevant to the interactions and 2) leveraging think time for asynchronous background computation for non-critical operators that might be relevant to future interactions. We show, through empirical analysis, that current user behavior presents ample opportunities for optimization, and the solutions we propose effectively harness such opportunities.
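
The core idea, speculatively using think time, can be sketched with a thread pool: compute the requested result on the critical path while background workers warm results for operations the user is likely to ask for next. The `likely_next_ops` heuristic and all names below are invented for illustration; this is not the paper's system.

```python
# Toy sketch of opportunistic evaluation: evaluate the requested operation
# immediately, and speculatively evaluate probable follow-ups in the background
# so they are ready if the user asks for them after their think time.
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.random.randn(100_000), "g": np.random.randint(0, 10, 100_000)})


def likely_next_ops(last_op):
    """Guess cheap follow-up operations a user often runs after `last_op`."""
    return {"head": lambda: df.head(), "describe": lambda: df.describe()}


with ThreadPoolExecutor(max_workers=2) as pool:
    # Critical path: the operation the user actually asked for.
    requested = df.groupby("g")["x"].mean()
    # Background: speculatively evaluate probable next interactions.
    speculative = {name: pool.submit(fn) for name, fn in likely_next_ops("groupby").items()}
    print(requested.head(3))
    # If the user then asks to describe the dataframe, the result may already be ready.
    print(speculative["describe"].result().loc["mean"])
```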

@misc{Xin21,
 abstract = {We propose opportunistic evaluation, a framework for accelerating interactions with dataframes. Interactive latency is critical for iterative, human-in-the-loop dataframe workloads for supporting exploratory data analysis. Opportunistic evaluation significantly reduces interactive latency by 1) prioritizing computation directly relevant to the interactions and 2) leveraging think time for asynchronous background computation for non-critical operators that might be relevant to future interactions. We show, through empirical analysis, that current user behavior presents ample opportunities for optimization, and the solutions we propose effectively harness such opportunities.},
 author = {Xin, Doris and Petersohn, Devin and Tang, Dixin and Wu, Yifan and Gonzalez, Joseph E. and Hellerstein, Joseph M. and Joseph, Anthony D. and Parameswaran, Aditya G.},
 bdsk-url-1 = {https://arxiv.org/abs/2103.02145},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2103.02145},
 copyright = {Creative Commons Attribution 4.0 International},
 doi = {10.48550/ARXIV.2103.02145},
 keywords = {arxivpre, Databases (cs.DB), FOS: Computer and information sciences, FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {Enhancing the Interactivity of Dataframe Queries by Leveraging Think Time},
 url = {https://arxiv.org/abs/2103.02145},
 year = {2021}
}

Yu Gai, Paras Jain, Wendi Zhang, Joseph E. Gonzalez, Dawn Song, and Ion Stoica. "Grounded Graph Decoding Improves Compositional Generalization in Question Answering." arXiv, 2021.

Question answering models struggle to generalize to novel compositions of training patterns, such as longer sequences or more complex test structures. Current end-to-end models learn a flat input embedding which can lose input syntax context. Prior approaches improve generalization by learning permutation invariant models, but these methods do not scale to more complex train-test splits. We propose Grounded Graph Decoding, a method to improve compositional generalization of language representations by grounding structured predictions with an attention mechanism. Grounding enables the model to retain syntax information from the input, thereby significantly improving generalization over complex inputs. By predicting a structured graph containing conjunctions of query clauses, we learn a group invariant representation without making assumptions on the target domain. Our model significantly outperforms state-of-the-art baselines on the Compositional Freebase Questions (CFQ) dataset, a challenging benchmark for compositional generalization in question answering. Moreover, we effectively solve the MCD1 split with 98\% accuracy.

@misc{Gai21,
 abstract = {Question answering models struggle to generalize to novel compositions of training patterns, such to longer sequences or more complex test structures. Current end-to-end models learn a flat input embedding which can lose input syntax context. Prior approaches improve generalization by learning permutation invariant models, but these methods do not scale to more complex train-test splits. We propose Grounded Graph Decoding, a method to improve compositional generalization of language representations by grounding structured predictions with an attention mechanism. Grounding enables the model to retain syntax information from the input in thereby significantly improving generalization over complex inputs. By predicting a structured graph containing conjunctions of query clauses, we learn a group invariant representation without making assumptions on the target domain. Our model significantly outperforms state-of-the-art baselines on the Compositional Freebase Questions (CFQ) dataset, a challenging benchmark for compositional generalization in question answering. Moreover, we effectively solve the MCD1 split with 98\% accuracy.},
 author = {Gai, Yu and Jain, Paras and Zhang, Wendi and Gonzalez, Joseph E. and Song, Dawn and Stoica, Ion},
 bdsk-url-1 = {https://arxiv.org/abs/2111.03642},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2111.03642},
 copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
 doi = {10.48550/ARXIV.2111.03642},
 keywords = {arxivpre},
 publisher = {arXiv},
 title = {Grounded Graph Decoding Improves Compositional Generalization in Question Answering},
 url = {https://arxiv.org/abs/2111.03642},
 year = {2021}
}

Charles Packer, Pieter Abbeel, and Joseph E Gonzalez. "Hindsight Task Relabelling: Experience Replay for Sparse Reward Meta-RL." Advances in Neural Information Processing Systems, 2021.

Meta-reinforcement learning (meta-RL) has proven to be a successful framework for leveraging experience from prior tasks to rapidly learn new related tasks; however, current meta-RL approaches struggle to learn in sparse reward environments. Although existing meta-RL algorithms can learn strategies for adapting to new sparse reward tasks, the actual adaptation strategies are learned using hand-shaped reward functions, or require simple environments where random exploration is sufficient to encounter sparse reward. In this paper we present a formulation of hindsight relabelling for meta-RL, which relabels experience during meta-training to enable learning to learn entirely using sparse reward. We demonstrate the effectiveness of our approach on a suite of challenging sparse reward environments that previously required dense reward during meta-training to solve. Our approach solves these environments using the true sparse reward function, with performance comparable to training with a proxy dense reward function.
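
For intuition, here is a generic hindsight-relabelling sketch in a toy goal-reaching setting; it shows the general relabelling idea rather than the paper's meta-RL formulation, and the 1-D random-walk environment, horizon, and reward tolerance are invented. Trajectories collected for a commanded goal are stored a second time as experience for the goal they actually reached, so the sparse reward signal becomes informative.

"""Generic hindsight-relabelling sketch (assumed goal-reaching task family,
not the paper's full meta-RL training loop)."""
import random

random.seed(1)


def sparse_reward(state, goal, tol=0.5):
    return 1.0 if abs(state - goal) <= tol else 0.0


def collect_trajectory(goal, horizon=10):
    """Toy 1-D environment: a random walk that rarely hits the commanded goal."""
    state, traj = 0.0, []
    for _ in range(horizon):
        action = random.uniform(-1.0, 1.0)
        next_state = state + action
        traj.append((state, action, sparse_reward(next_state, goal), next_state, goal))
        state = next_state
    return traj


replay_buffer = []
orig_hits = relabel_hits = 0
for episode in range(100):
    commanded_goal = random.uniform(-5.0, 5.0)
    traj = collect_trajectory(commanded_goal)
    replay_buffer.extend(traj)                 # original, mostly zero-reward experience

    # Hindsight relabelling: pretend the task was to reach the state the rollout ended in.
    achieved_goal = traj[-1][3]
    relabelled = [(s, a, sparse_reward(s2, achieved_goal), s2, achieved_goal)
                  for (s, a, _, s2, _) in traj]
    replay_buffer.extend(relabelled)

    orig_hits += sum(r > 0 for (_, _, r, _, _) in traj)
    relabel_hits += sum(r > 0 for (_, _, r, _, _) in relabelled)

print(f"transitions with nonzero reward: original={orig_hits}, after relabelling={relabel_hits}")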

@inproceedings{NEURIPS2021_1454ca22,
 abstract = {Meta-reinforcement learning (meta-RL) has proven to be a successful framework for leveraging experience from prior tasks to rapidly learn new related tasks, however, current meta-RL approaches struggle to learn in sparse reward environments. Although existing meta-RL algorithms can learn strategies for adapting to new sparse reward tasks, the actual adaptation strategies are learned using hand-shaped reward functions, or require simple environments where random exploration is sufficient to encounter sparse reward. In this paper we present a formulation of hindsight relabelling for meta-RL, which relabels experience during meta-training to enable learning to learn entirely using sparse reward. We demonstrate the effectiveness of our approach on a suite of challenging sparse reward environments that previously required dense reward during meta-training to solve. Our approach solves these environments using the true sparse reward function, with performance comparable to training with a proxy dense reward function.},
 author = {Packer, Charles and Abbeel, Pieter and Gonzalez, Joseph E},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2021/file/1454ca2270599546dfcd2a3700e4d2f1-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
 keywords = {peerrev, selected},
 pages = {2466--2477},
 publisher = {Curran Associates, Inc.},
 title = {Hindsight Task Relabelling: Experience Replay for Sparse Reward Meta-RL},
 url = {https://proceedings.neurips.cc/paper/2021/file/1454ca2270599546dfcd2a3700e4d2f1-Paper.pdf},
 volume = {34},
 year = {2021}
}

Wenshuo Guo, Kirthevasan Kandasamy, Joseph E Gonzalez, Michael I. Jordan, and Ion Stoica. "Learning Competitive Equilibria in Exchange Economies with Bandit Feedback." arXiv, 2021.

The sharing of scarce resources among multiple rational agents is one of the classical problems in economics. In exchange economies, which are used to model such situations, agents begin with an initial endowment of resources and exchange them in a way that is mutually beneficial until they reach a competitive equilibrium (CE). The allocations at a CE are Pareto efficient and fair. Consequently, they are used widely in designing mechanisms for fair division. However, computing CEs requires the knowledge of agent preferences which are unknown in several applications of interest. In this work, we explore a new online learning mechanism, which, on each round, allocates resources to the agents and collects stochastic feedback on their experience in using that allocation. Its goal is to learn the agent utilities via this feedback and imitate the allocations at a CE in the long run. We quantify CE behavior via two losses and propose a randomized algorithm which achieves sublinear loss under a parametric class of utilities. Empirically, we demonstrate the effectiveness of this mechanism through numerical simulations.

@misc{Guo21,
 abstract = {The sharing of scarce resources among multiple rational agents is one of the classical problems in economics. In exchange economies, which are used to model such situations, agents begin with an initial endowment of resources and exchange them in a way that is mutually beneficial until they reach a competitive equilibrium (CE). The allocations at a CE are Pareto efficient and fair. Consequently, they are used widely in designing mechanisms for fair division. However, computing CEs requires the knowledge of agent preferences which are unknown in several applications of interest. In this work, we explore a new online learning mechanism, which, on each round, allocates resources to the agents and collects stochastic feedback on their experience in using that allocation. Its goal is to learn the agent utilities via this feedback and imitate the allocations at a CE in the long run. We quantify CE behavior via two losses and propose a randomized algorithm which achieves sublinear loss under a parametric class of utilities. Empirically, we demonstrate the effectiveness of this mechanism through numerical simulations.},
 author = {Guo, Wenshuo and Kandasamy, Kirthevasan and Gonzalez, Joseph E and Jordan, Michael I. and Stoica, Ion},
 bdsk-url-1 = {https://arxiv.org/abs/2106.06616},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2106.06616},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2106.06616},
 keywords = {arxivpre, Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {Learning Competitive Equilibria in Exchange Economies with Bandit Feedback},
 url = {https://arxiv.org/abs/2106.06616},
 year = {2021}
}

Kevin Yang, Tianjun Zhang, Chris Cummins, Brandon Cui, Benoit Steiner, Linnan Wang, Joseph E Gonzalez, Dan Klein, and Yuandong Tian. "Learning Space Partitions for Path Planning." Advances in Neural Information Processing Systems, 2021.

Path planning, the problem of efficiently discovering high-reward trajectories, often requires optimizing a high-dimensional and multimodal reward function. Popular approaches like CEM and CMA-ES greedily focus on promising regions of the search space and may get trapped in local maxima. DOO and VOOT balance exploration and exploitation, but use space partitioning strategies independent of the reward function to be optimized. Recently, LaMCTS empirically learns to partition the search space in a reward-sensitive manner for black-box optimization. In this paper, we develop a novel formal regret analysis for when and why such an adaptive region partitioning scheme works. We also propose a new path planning method LaP3 which improves the function value estimation within each sub-region, and uses a latent representation of the search space. Empirically, LaP3 outperforms existing path planning methods in 2D navigation tasks, especially in the presence of difficult-to-escape local optima, and shows benefits when plugged into the planning components of model-based RL such as PETS. These gains transfer to highly multimodal real-world tasks, where we outperform strong baselines in compiler phase ordering by up to 39\% on average across 9 tasks, and in molecular design by up to 0.4 on properties on a 0-1 scale. Code is available at \url{https://github.com/yangkevin2/neurips2021-lap3}.

@inproceedings{NEURIPS2021_03a3655f,
 abstract = {Path planning, the problem of efficiently discovering high-reward trajectories, often requires optimizing a high-dimensional and multimodal reward function. Popular approaches like CEM and CMA-ES greedily focus on promising regions of the search space and may get trapped in local maxima. DOO and VOOT balance exploration and exploitation, but use space partitioning strategies independent of the reward function to be optimized. Recently, LaMCTS empirically learns to partition the search space in a reward-sensitive manner for black-box optimization. In this paper, we develop a novel formal regret analysis for when and why such an adaptive region partitioning scheme works. We also propose a new path planning method LaP3 which improves the function value estimation within each sub-region, and uses a latent representation of the search space. Empirically, LaP3 outperforms existing path planning methods in 2D navigation tasks, especially in the presence of difficult-to-escape local optima, and shows benefits when plugged into the planning components of model-based RL such as PETS. These gains transfer to highly multimodal real-world tasks, where we outperform strong baselines in compiler phase ordering by up to 39\% on average across 9 tasks, and in molecular design by up to 0.4 on properties on a 0-1 scale. Code is available at \url{https://github.com/yangkevin2/neurips2021-lap3}.},
 author = {Yang, Kevin and Zhang, Tianjun and Cummins, Chris and Cui, Brandon and Steiner, Benoit and Wang, Linnan and Gonzalez, Joseph E and Klein, Dan and Tian, Yuandong},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2021/file/03a3655fff3e9bdea48de9f49e938e32-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 code = {https://github.com/yangkevin2/neurips2021-lap3},
 editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
 keywords = {peerrev, selected},
 pages = {378--391},
 publisher = {Curran Associates, Inc.},
 title = {Learning Space Partitions for Path Planning},
 url = {https://proceedings.neurips.cc/paper/2021/file/03a3655fff3e9bdea48de9f49e938e32-Paper.pdf},
 volume = {34},
 year = {2021}
}

Tianjun Zhang, Paria Rashidinejad, Jiantao Jiao, Yuandong Tian, Joseph E Gonzalez, and Stuart Russell. "MADE: Exploration via Maximizing Deviation from Explored Regions." Advances in Neural Information Processing Systems, 2021.

In online reinforcement learning (RL), efficient exploration remains particularly challenging in high-dimensional environments with sparse rewards. In low-dimensional environments, where tabular parameterization is possible, count-based upper confidence bound (UCB) exploration methods achieve minimax near-optimal rates. However, it remains unclear how to efficiently implement UCB in realistic RL tasks that involve non-linear function approximation. To address this, we propose a new exploration approach via maximizing the deviation of the occupancy of the next policy from the explored regions. We add this term as an adaptive regularizer to the standard RL objective to balance exploration vs. exploitation. We pair the new objective with a provably convergent algorithm, giving rise to a new intrinsic reward that adjusts existing bonuses. The proposed intrinsic reward is easy to implement and combine with other existing RL algorithms to conduct exploration. As a proof of concept, we evaluate the new intrinsic reward on tabular examples across a variety of model-based and model-free algorithms, showing improvements over count-only exploration strategies. When tested on navigation and locomotion tasks from MiniGrid and DeepMind Control Suite benchmarks, our approach significantly improves sample efficiency over state-of-the-art methods.

@inproceedings{Zhang21a,
 abstract = {In online reinforcement learning (RL), efficient exploration remains particularly challenging in high-dimensional environments with sparse rewards. In low-dimensional environments, where tabular parameterization is possible, count-based upper confidence bound (UCB) exploration methods achieve minimax near-optimal rates. However, it remains unclear how to efficiently implement UCB in realistic RL tasks that involve non-linear function approximation. To address this, we propose a new exploration approach via maximizing the deviation of the occupancy of the next policy from the explored regions. We add this term as an adaptive regularizer to the standard RL objective to balance exploration vs. exploitation. We pair the new objective with a provably convergent algorithm, giving rise to a new intrinsic reward that adjusts existing bonuses. The proposed intrinsic reward is easy to implement and combine with other existing RL algorithms to conduct exploration. As a proof of concept, we evaluate the new intrinsic reward on tabular examples across a variety of model-based and model-free algorithms, showing improvements over count-only exploration strategies. When tested on navigation and locomotion tasks from MiniGrid and DeepMind Control Suite benchmarks, our approach significantly improves sample efficiency over state-of-the-art methods.},
 author = {Zhang, Tianjun and Rashidinejad, Paria and Jiao, Jiantao and Tian, Yuandong and Gonzalez, Joseph E and Russell, Stuart},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2021/file/5011bf6d8a37692913fce3a15a51f070-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
 keywords = {peerrev, selected},
 pages = {9663--9680},
 publisher = {Curran Associates, Inc.},
 title = {MADE: Exploration via Maximizing Deviation from Explored Regions},
 url = {https://proceedings.neurips.cc/paper/2021/file/5011bf6d8a37692913fce3a15a51f070-Paper.pdf},
 volume = {34},
 year = {2021}
}

Alvin Wan, Lisa Dunlap, Daniel Ho, Jihan Yin, Scott Lee, Suzanne Petryk, Sarah Adel Bargal, and Joseph E. Gonzalez. "NBDT: Neural-Backed Decision Tree." International Conference on Learning Representations, 2021.

Machine learning applications such as finance and medicine demand accurate and justifiable predictions, barring most deep learning methods from use. In response, previous work combines decision trees with deep learning, yielding models that (1) sacrifice interpretability for accuracy or (2) sacrifice accuracy for interpretability. We forgo this dilemma by jointly improving accuracy and interpretability using Neural-Backed Decision Trees (NBDTs). NBDTs replace a neural network's final linear layer with a differentiable sequence of decisions and a surrogate loss. This forces the model to learn high-level concepts and lessens reliance on highly-uncertain decisions, yielding (1) accuracy: NBDTs match or outperform modern neural networks on CIFAR and ImageNet and generalize better to unseen classes, by up to 16\%. Furthermore, our surrogate loss improves the original model's accuracy by up to 2\%. NBDTs also afford (2) interpretability: improving human trust by clearly identifying model mistakes and assisting in dataset debugging.
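
The inference scheme can be sketched compactly. The code below is a toy illustration of NBDT-style soft inference, not the released implementation: the 4-class hierarchy, random weights, and feature vector are invented, and the real method induces its hierarchy from the trained network's weights. Inner-node decisions are softmaxes over dot products with averaged class weight vectors, and a class's probability is the product of branch probabilities along its root-to-leaf path.

"""Minimal sketch of NBDT-style soft inference on an assumed 4-class toy hierarchy."""
import numpy as np

rng = np.random.default_rng(0)
d, num_classes = 16, 4
W = rng.normal(size=(num_classes, d))   # rows: final-layer class weight vectors
feature = rng.normal(size=d)            # penultimate-layer output for one input


def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()


# Hand-built hierarchy: root splits {0,1} vs {2,3}; each inner node splits its pair.
hierarchy = {"root": [[0, 1], [2, 3]], "left": [[0], [1]], "right": [[2], [3]]}


def node_probs(children):
    """Decision at an inner node: softmax over dot products with averaged child weights."""
    reps = np.stack([W[c].mean(axis=0) for c in children])
    return softmax(reps @ feature)


p_root = node_probs(hierarchy["root"])
p_left = node_probs(hierarchy["left"])
p_right = node_probs(hierarchy["right"])

# Path probability = product of branch probabilities from root to the class leaf.
class_probs = np.array([
    p_root[0] * p_left[0],    # class 0
    p_root[0] * p_left[1],    # class 1
    p_root[1] * p_right[0],   # class 2
    p_root[1] * p_right[1],   # class 3
])
print("NBDT class probabilities:", np.round(class_probs, 3), "sum =", class_probs.sum())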

@inproceedings{wan2021nbdt,
 abstract = {Machine learning applications such as finance and medicine demand accurate and justifiable predictions, barring most deep learning methods from use. In response, previous work combines decision trees with deep learning, yielding models that (1) sacrifice interpretability for accuracy or (2) sacrifice accuracy for interpretability. We forgo this dilemma by jointly improving accuracy and interpretability using Neural-Backed Decision Trees (NBDTs). NBDTs replace a neural network's final linear layer with a differentiable sequence of decisions and a surrogate loss. This forces the model to learn high-level concepts and lessens reliance on highly-uncertain decisions, yielding (1) accuracy: NBDTs match or outperform modern neural networks on CIFAR, ImageNet and better generalize to unseen classes by up to 16\%. Furthermore, our surrogate loss improves the original model's accuracy by up to 2\%. NBDTs also afford (2) interpretability: improving human trustby clearly identifying model mistakes and assisting in dataset debugging.},
 author = {Alvin Wan and Lisa Dunlap and Daniel Ho and Jihan Yin and Scott Lee and Suzanne Petryk and Sarah Adel Bargal and Joseph E. Gonzalez},
 bdsk-url-1 = {https://openreview.net/forum?id=mCLVeEpplNE},
 booktitle = {International Conference on Learning Representations},
 code = {https://github.com/alvinwan/neural-backed-decision-trees},
 keywords = {peerrev, selected},
 title = { {NBDT}: Neural-Backed Decision Tree},
 url = {https://openreview.net/forum?id=mCLVeEpplNE},
 year = {2021}
}

Tianjun Zhang, Huazhe Xu, Xiaolong Wang, Yi Wu, Kurt Keutzer, Joseph E Gonzalez, and Yuandong Tian. "NovelD: A Simple yet Effective Exploration Criterion." Advances in Neural Information Processing Systems, 2021.

Efficient exploration under sparse rewards remains a key challenge in deep reinforcement learning. Previous exploration methods (e.g., RND) have achieved strong results in multiple hard tasks. However, if there are multiple novel areas to explore, these methods often focus quickly on one without sufficiently trying others (akin to a depth-first search). In some scenarios (e.g., the four-corridor environment in Sec 4.2), we observe that they explore one corridor for a long time and fail to cover all the states. On the other hand, in theoretical RL, with optimistic initialization and the inverse square root of the visitation count as a bonus, an agent does not suffer from this and explores different novel regions alternately (akin to a breadth-first search). Inspired by this, we propose a simple but effective criterion called NovelD that weights every novel area approximately equally. Our algorithm is very simple yet shows comparable performance to, or even outperforms, multiple SOTA exploration methods on many hard exploration tasks. Specifically, NovelD solves all the static procedurally-generated tasks in MiniGrid with just 120M environment steps, without any curriculum learning. In comparison, the previous SOTA solves only 50\% of them. NovelD also achieves SOTA on multiple tasks in NetHack, a rogue-like game that contains more challenging procedurally-generated environments. In multiple Atari games (e.g., Montezuma's Revenge, Venture, Gravitar), NovelD outperforms RND. We analyze NovelD thoroughly in MiniGrid and find that, empirically, it helps the agent explore the environment more uniformly, with a focus on exploring beyond the boundary.
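
The criterion itself is short enough to sketch. In the snippet below, a count-based novelty proxy stands in for the RND novelty used in the paper, and the constant ALPHA and the toy trajectory are illustrative: the intrinsic bonus is the clipped difference in novelty between consecutive states, granted only on the first visit to the new state within an episode.

"""Sketch of a NovelD-style intrinsic reward with a count-based novelty stand-in."""
from collections import defaultdict

ALPHA = 0.5
visit_counts = defaultdict(int)        # lifelong visitation counts (novelty proxy)


def novelty(state):
    # Higher for rarely visited states; RND prediction error plays this role in the paper.
    return 1.0 / (1.0 + visit_counts[state]) ** 0.5


def noveld_bonus(state, next_state, episodic_counts):
    episodic_counts[next_state] += 1
    bonus = max(novelty(next_state) - ALPHA * novelty(state), 0.0)
    gated = bonus if episodic_counts[next_state] == 1 else 0.0
    visit_counts[next_state] += 1      # update lifelong counts after computing novelty
    return gated


# Walk a toy trajectory twice; bonuses shrink as states stop being novel.
trajectory = ["s0", "s1", "s2", "s1", "s3"]
for episode in range(2):
    episodic = defaultdict(int)
    bonuses = [noveld_bonus(s, s2, episodic) for s, s2 in zip(trajectory, trajectory[1:])]
    print(f"episode {episode}: intrinsic bonuses = {[round(b, 3) for b in bonuses]}")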

@inproceedings{Zhang21b,
 abstract = {Efficient exploration under sparse rewards remains a key challenge in deep reinforcement learning. Previous exploration methods (e.g., RND) have achieved strong results in multiple hard tasks. However, if there are multiple novel areas to explore, these methods often focus quickly on one without sufficiently trying others (like a depth-wise first search manner). In some scenarios (e.g., four corridor environment in Sec 4.2), we observe they explore in one corridor for long and fail to cover all the states. On the other hand, in theoretical RL, with optimistic initialization and the inverse square root of visitation count as a bonus, it won't suffer from this and explores different novel regions alternatively (like a breadth-first search manner). In this paper, inspired by this, we propose a simple but effective criterion called NovelD by weighting every novel area approximately equally. Our algorithm is very simple but yet shows comparable performance or even outperforms multiple SOTA exploration methods in many hard exploration tasks. Specifically, NovelD solves all the static procedurally-generated tasks in Mini-Grid with just 120M environment steps, without any curriculum learning. In comparison, the previous SOTA only solves 50\% of them. NovelD also achieves SOTA on multiple tasks in NetHack, a rogue-like game that contains more challenging procedurally-generated environments. In multiple Atari games (e.g., MonteZuma's Revenge, Venture, Gravitar), NovelD outperforms RND. We analyze NovelD thoroughly in MiniGrid and found that empirically it helps the agent explore the environment more uniformly with a focus on exploring beyond the boundary.},
 author = {Zhang, Tianjun and Xu, Huazhe and Wang, Xiaolong and Wu, Yi and Keutzer, Kurt and Gonzalez, Joseph E and Tian, Yuandong},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2021/file/d428d070622e0f4363fceae11f4a3576-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
 keywords = {peerrev, selected},
 pages = {25217--25230},
 publisher = {Curran Associates, Inc.},
 title = {NovelD: A Simple yet Effective Exploration Criterion},
 url = {https://proceedings.neurips.cc/paper/2021/file/d428d070622e0f4363fceae11f4a3576-Paper.pdf},
 volume = {34},
 year = {2021}
}

Brijen Thananjeyan, Kirthevasan Kandasamy, Ion Stoica, Michael I. Jordan, Ken Goldberg, and Joseph E. Gonzalez. "PAC Best Arm Identification Under a Deadline." arXiv, 2021.

We study (ϵ,δ)-PAC best arm identification, where a decision-maker must identify an ϵ-optimal arm with probability at least 1−δ, while minimizing the number of arm pulls (samples). Most of the work on this topic is in the sequential setting, where there is no constraint on the time taken to identify such an arm; this allows the decision-maker to pull one arm at a time. In this work, the decision-maker is given a deadline of T rounds, where, on each round, it can adaptively choose which arms to pull and how many times to pull them; this distinguishes the number of decisions made (i.e., time or number of rounds) from the number of samples acquired (cost). Such situations occur in clinical trials, where one may need to identify a promising treatment under a deadline while minimizing the number of test subjects, or in simulation-based studies run on the cloud, where we can elastically scale up or down the number of virtual machines to conduct as many experiments as we wish, but need to pay for the resource-time used. As the decision-maker can only make T decisions, she may need to pull some arms excessively relative to a sequential algorithm in order to perform well on all possible problems. We formalize this added difficulty with two hardness results that indicate that unlike sequential settings, the ability to adapt to the problem difficulty is constrained by the finite deadline. We propose Elastic Batch Racing (EBR), a novel algorithm for this setting and bound its sample complexity, showing that EBR is optimal with respect to both hardness results. We present simulations evaluating EBR in this setting, where it outperforms baselines by several orders of magnitude.
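
To make the round-limited setting concrete, here is a generic batch-racing sketch; it uses a simple Hoeffding-style elimination rule with invented arm means, batch size, and failure probability, not the paper's EBR allocation strategy. With only T rounds of decisions, every surviving arm is pulled in a parallel batch each round, and arms that are confidently worse than the leader are eliminated.

"""Generic batch-racing sketch under a deadline of T rounds (not the paper's EBR rule)."""
import math
import random

random.seed(0)

TRUE_MEANS = [0.30, 0.45, 0.50, 0.62, 0.65]
T_ROUNDS = 5
BATCH = 40                     # pulls per surviving arm per round
DELTA = 0.05                   # failure probability


def pull(arm):
    return 1.0 if random.random() < TRUE_MEANS[arm] else 0.0


counts = [0] * len(TRUE_MEANS)
sums = [0.0] * len(TRUE_MEANS)
surviving = set(range(len(TRUE_MEANS)))

for rnd in range(T_ROUNDS):
    for arm in surviving:
        for _ in range(BATCH):
            sums[arm] += pull(arm)
            counts[arm] += 1
    means = {a: sums[a] / counts[a] for a in surviving}
    # Hoeffding-style confidence radius.
    rad = {a: math.sqrt(math.log(2 * len(TRUE_MEANS) * T_ROUNDS / DELTA) / (2 * counts[a]))
           for a in surviving}
    leader = max(surviving, key=lambda a: means[a])
    surviving = {a for a in surviving
                 if means[a] + rad[a] >= means[leader] - rad[leader]}
    print(f"round {rnd}: surviving arms = {sorted(surviving)}")

print("identified arm:", max(surviving, key=lambda a: sums[a] / counts[a]))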

@misc{Thananjeyan21b,
 abstract = {We study (ϵ,δ)-PAC best arm identification, where a decision-maker must identify an ϵ-optimal arm with probability at least 1−δ, while minimizing the number of arm pulls (samples). Most of the work on this topic is in the sequential setting, where there is no constraint on the time taken to identify such an arm; this allows the decision-maker to pull one arm at a time. In this work, the decision-maker is given a deadline of T rounds, where, on each round, it can adaptively choose which arms to pull and how many times to pull them; this distinguishes the number of decisions made (i.e., time or number of rounds) from the number of samples acquired (cost). Such situations occur in clinical trials, where one may need to identify a promising treatment under a deadline while minimizing the number of test subjects, or in simulation-based studies run on the cloud, where we can elastically scale up or down the number of virtual machines to conduct as many experiments as we wish, but need to pay for the resource-time used. As the decision-maker can only make T decisions, she may need to pull some arms excessively relative to a sequential algorithm in order to perform well on all possible problems. We formalize this added difficulty with two hardness results that indicate that unlike sequential settings, the ability to adapt to the problem difficulty is constrained by the finite deadline. We propose Elastic Batch Racing (EBR), a novel algorithm for this setting and bound its sample complexity, showing that EBR is optimal with respect to both hardness results. We present simulations evaluating EBR in this setting, where it outperforms baselines by several orders of magnitude.},
 author = {Thananjeyan, Brijen and Kandasamy, Kirthevasan and Stoica, Ion and Jordan, Michael I. and Goldberg, Ken and Gonzalez, Joseph E.},
 bdsk-url-1 = {https://arxiv.org/abs/2106.03221},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2106.03221},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2106.03221},
 keywords = {arxivpre, Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {PAC Best Arm Identification Under a Deadline},
 url = {https://arxiv.org/abs/2106.03221},
 year = {2021}
}

Eric Liang, Zhanghao Wu, Michael Luo, Sven Mika, Joseph E Gonzalez, and Ion Stoica. "RLlib Flow: Distributed Reinforcement Learning is a Dataflow Problem." Advances in Neural Information Processing Systems, 2021.

Researchers and practitioners in the field of reinforcement learning (RL) frequently leverage parallel computation, which has led to a plethora of new algorithms and systems in the last few years. In this paper, we re-examine the challenges posed by distributed RL and try to view it through the lens of an old idea: distributed dataflow. We show that viewing RL as a dataflow problem leads to highly composable and performant implementations. We propose RLlib Flow, a hybrid actor-dataflow programming model for distributed RL, and validate its practicality by porting the full suite of algorithms in RLlib, a widely adopted distributed RL library. Concretely, RLlib Flow provides 2-9x code savings in real production code and enables the composition of multi-agent algorithms not possible by end users before. The open-source code is available as part of RLlib at \url{https://github.com/ray-project/ray/tree/master/rllib}.
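
The dataflow framing can be illustrated with plain Python generators; this is only a sketch of the programming model, not RLlib Flow's actual operators or distributed execution, and all function names here are invented. A rollout source yields sample batches, and downstream operators are composed as transformations over that iterator.

"""Illustrative dataflow view of an RL training loop using generator pipelines."""
import random

random.seed(0)


def parallel_rollouts(num_workers, batch_size):
    """Source operator: an endless iterator of sample batches from workers."""
    while True:
        worker = random.randrange(num_workers)
        yield {"worker": worker, "rewards": [random.random() for _ in range(batch_size)]}


def concat_batches(batches, train_batch_size):
    """Transformation: gather small batches into training-sized batches."""
    buf = []
    for b in batches:
        buf.extend(b["rewards"])
        if len(buf) >= train_batch_size:
            yield buf[:train_batch_size]
            buf = buf[train_batch_size:]


def train_on_batches(train_batches):
    """Sink: consume training batches and emit metrics."""
    for step, batch in enumerate(train_batches):
        yield {"step": step, "mean_reward": sum(batch) / len(batch)}


# Compose the dataflow: rollouts -> concat -> train, then drive it.
pipeline = train_on_batches(
    concat_batches(parallel_rollouts(num_workers=4, batch_size=8), train_batch_size=32))
for metrics in pipeline:
    print(metrics)
    if metrics["step"] >= 2:
        break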

@inproceedings{NEURIPS2021_2bce32ed,
 abstract = {Researchers and practitioners in the field of reinforcement learning (RL) frequently leverage parallel computation, which has led to a plethora of new algorithms and systems in the last few years. In this paper, we re-examine the challenges posed by distributed RL and try to view it through the lens of an old idea: distributed dataflow. We show that viewing RL as a dataflow problem leads to highly composable and performant implementations. We propose RLlib Flow, a hybrid actor-dataflow programming model for distributed RL, and validate its practicality by porting the full suite of algorithms in RLlib, a widely adopted distributed RL library. Concretely, RLlib Flow provides 2-9x code savings in real production code and enables the composition of multi-agent algorithms not possible by end users before. The open-source code is available as part of RLlib at \url{https://github.com/ray-project/ray/tree/master/rllib}.},
 author = {Liang, Eric and Wu, Zhanghao and Luo, Michael and Mika, Sven and Gonzalez, Joseph E and Stoica, Ion},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2021/file/2bce32ed409f5ebcee2a7b417ad9beed-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 code = {https://github.com/ray-project/ray/tree/master/rllib},
 editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
 keywords = {peerrev, selected},
 pages = {5506--5517},
 publisher = {Curran Associates, Inc.},
 title = {RLlib Flow: Distributed Reinforcement Learning is a Dataflow Problem},
 url = {https://proceedings.neurips.cc/paper/2021/file/2bce32ed409f5ebcee2a7b417ad9beed-Paper.pdf},
 volume = {34},
 year = {2021}
}

Sayna Ebrahimi, Suzanne Petryk, Akash Gokul, William Gan, Joseph E. Gonzalez, Marcus Rohrbach, and Trevor Darrell. "Remembering for the right reasons: Explanations reduce catastrophic forgetting." Applied AI Letters, 2021.

The goal of continual learning (CL) is to learn a sequence of tasks without suffering from the phenomenon of catastrophic forgetting. Previous work has shown that leveraging memory in the form of a replay buffer can reduce performance degradation on prior tasks. We hypothesize that forgetting can be further reduced when the model is encouraged to remember the evidence for previously made decisions. As a first step towards exploring this hypothesis, we propose a simple novel training paradigm, called Remembering for the Right Reasons (RRR), that additionally stores visual model explanations for each example in the buffer and ensures the model has "the right reasons" for its predictions by encouraging its explanations to remain consistent with those used to make decisions at training time. Without this constraint, explanations drift and forgetting increases as conventional continual learning algorithms learn new tasks. We demonstrate how RRR can be easily added to any memory- or regularization-based approach and results in reduced forgetting and, more importantly, improved model explanations. We evaluated our approach in the standard and few-shot settings and observed a consistent improvement across various CL approaches using different architectures and techniques to generate model explanations, demonstrating a promising connection between explainability and continual learning. Our code is available at https://github.com/SaynaEbrahimi/Remembering-for-the-Right-Reasons.
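
A minimal version of the training objective is easy to write down. The PyTorch sketch below uses a toy model and plain input-gradient saliency in place of the paper's explanation methods, with an invented weight LAMBDA: each replay example stores the explanation it had when first learned, and an L1 term penalizes drift from that stored explanation while training on a new task.

"""Minimal PyTorch sketch of an RRR-style explanation-consistency term (toy model,
input-gradient saliency standing in for the paper's explanation methods)."""
import torch
import torch.nn.functional as F

torch.manual_seed(0)
model = torch.nn.Sequential(torch.nn.Linear(10, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2))


def saliency(x, y):
    """Input-gradient explanation for the true-class logit (differentiable for training)."""
    x = x.clone().requires_grad_(True)
    logit = model(x)[torch.arange(len(y)), y].sum()
    grad, = torch.autograd.grad(logit, x, create_graph=True)
    return grad.abs()


# Fill a tiny replay buffer for "task A" and store its explanations.
x_old, y_old = torch.randn(16, 10), torch.randint(0, 2, (16,))
stored_expl = saliency(x_old, y_old).detach()

# Later, while training on "task B", add the explanation-consistency term.
x_new, y_new = torch.randn(16, 10), torch.randint(0, 2, (16,))
opt = torch.optim.SGD(model.parameters(), lr=0.05)
LAMBDA = 1.0

for step in range(20):
    opt.zero_grad()
    task_loss = F.cross_entropy(model(x_new), y_new)
    replay_loss = F.cross_entropy(model(x_old), y_old)
    rrr_loss = (saliency(x_old, y_old) - stored_expl).abs().mean()   # keep old explanations
    loss = task_loss + replay_loss + LAMBDA * rrr_loss
    loss.backward()
    opt.step()

print(f"final losses: task={task_loss.item():.3f}, replay={replay_loss.item():.3f}, "
      f"explanation drift={rrr_loss.item():.4f}")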

@article{RRR21,
 abstract = {Abstract The goal of continual learning (CL) is to learn a sequence of tasks without suffering from the phenomenon of catastrophic forgetting. Previous work has shown that leveraging memory in the form of a replay buffer can reduce performance degradation on prior tasks. We hypothesize that forgetting can be further reduced when the model is encouraged to remember the evidence for previously made decisions. As a first step towards exploring this hypothesis, we propose a simple novel training paradigm, called Remembering for the Right Reasons (RRR), that additionally stores visual model explanations for each example in the buffer and ensures the model has ``the right reasons'' for its predictions by encouraging its explanations to remain consistent with those used to make decisions at training time. Without this constraint, there is a drift in explanations and increase in forgetting as conventional continual learning algorithms learn new tasks. We demonstrate how RRR can be easily added to any memory or regularization-based approach and results in reduced forgetting, and more importantly, improved model explanations. We have evaluated our approach in the standard and few-shot settings and observed a consistent improvement across various CL approaches using different architectures and techniques to generate model explanations and demonstrated our approach showing a promising connection between explainability and continual learning. Our code is available at https://github.com/SaynaEbrahimi/Remembering-for-the-Right-Reasons.},
 author = {Ebrahimi, Sayna and Petryk, Suzanne and Gokul, Akash and Gan, William and Gonzalez, Joseph E. and Rohrbach, Marcus and Darrell, Trevor},
 bdsk-url-1 = {https://onlinelibrary.wiley.com/doi/abs/10.1002/ail2.44},
 bdsk-url-2 = {https://doi.org/10.1002/ail2.44},
 code = {https://github.com/SaynaEbrahimi/},
 doi = {10.1002/ail2.44},
 eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/ail2.44},
 journal = {Applied AI Letters},
 keywords = {peerrev, selected},
 number = {4},
 pages = {e44},
 title = {Remembering for the right reasons: Explanations reduce catastrophic forgetting},
 url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/ail2.44},
 volume = {2},
 year = {2021}
}

Zhanghao Wu, Paras Jain, Matthew Wright, Azalia Mirhoseini, Joseph E Gonzalez, and Ion Stoica. "Representing Long-Range Context for Graph Neural Networks with Global Attention." Advances in Neural Information Processing Systems, 2021.

Graph neural networks are powerful architectures for structured datasets. However, current methods struggle to represent long-range dependencies. Scaling the depth or width of GNNs is insufficient to broaden receptive fields as larger GNNs encounter optimization instabilities such as vanishing gradients and representation oversmoothing, while pooling-based approaches have yet to become as universally useful as in computer vision. In this work, we propose the use of Transformer-based self-attention to learn long-range pairwise relationships, with a novel "readout" mechanism to obtain a global graph embedding. Inspired by recent computer vision results that find position-invariant attention performant in learning long-range relationships, our method, which we call GraphTrans, applies a permutation-invariant Transformer module after a standard GNN module. This simple architecture leads to state-of-the-art results on several graph classification tasks, outperforming methods that explicitly encode graph structure. Our results suggest that purely-learning-based approaches without graph structure may be suitable for learning high-level, long-range relationships on graphs. Code for GraphTrans is available at \url{https://github.com/ucbrise/graphtrans}.
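
The architecture's overall shape is simple to sketch. The PyTorch module below is a simplified stand-in for the paper's model: one round of mean-aggregation message passing replaces a full GNN, the dimensions and toy ring graph are invented, and readout uses a learned CLS token position as described above.

"""Simplified sketch of the GraphTrans shape: local GNN stage, then a
permutation-invariant Transformer over node embeddings with a CLS readout."""
import torch
import torch.nn as nn

torch.manual_seed(0)


class GNNThenTransformer(nn.Module):
    def __init__(self, in_dim=8, hidden=32, num_classes=3):
        super().__init__()
        self.gnn_lin = nn.Linear(in_dim, hidden)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.cls = nn.Parameter(torch.zeros(1, 1, hidden))   # learned readout token
        self.head = nn.Linear(hidden, num_classes)

    def forward(self, x, adj):
        # One round of mean-aggregation message passing (stand-in for a deeper GNN).
        deg = adj.sum(-1, keepdim=True).clamp(min=1)
        h = torch.relu(self.gnn_lin((adj @ x) / deg + x))
        # Prepend the CLS token, then let self-attention capture long-range pairs.
        h = torch.cat([self.cls.expand(h.size(0), -1, -1), h], dim=1)
        h = self.transformer(h)
        return self.head(h[:, 0])              # graph embedding = CLS position


# Toy batch of one graph: 5 nodes with ring adjacency.
x = torch.randn(1, 5, 8)
adj = torch.zeros(1, 5, 5)
for i in range(5):
    adj[0, i, (i + 1) % 5] = adj[0, i, (i - 1) % 5] = 1.0

model = GNNThenTransformer()
print("graph logits:", model(x, adj))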

@inproceedings{NEURIPS2021_6e67691b,
 abstract = {Graph neural networks are powerful architectures for structured datasets. However, current methods struggle to represent long-range dependencies. Scaling the depth or width of GNNs is insufficient to broaden receptive fields as larger GNNs encounter optimization instabilities such as vanishing gradients and representation oversmoothing, while pooling-based approaches have yet to become as universally useful as in computer vision. In this work, we propose the use of Transformer-based self-attention to learn long-range pairwise relationships, with a novel ``readout" mechanism to obtain a global graph embedding. Inspired by recent computer vision results that find position-invariant attention performant in learning long-range relationships, our method, which we call GraphTrans, applies a permutation-invariant Transformer module after a standard GNN module. This simple architecture leads to state-of-the-art results on several graph classification tasks, outperforming methods that explicitly encode graph structure. Our results suggest that purely-learning-based approaches without graph structure may be suitable for learning high-level, long-range relationships on graphs. Code for GraphTrans is available at \url{https://github.com/ucbrise/graphtrans}.},
 author = {Wu, Zhanghao and Jain, Paras and Wright, Matthew and Mirhoseini, Azalia and Gonzalez, Joseph E and Stoica, Ion},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2021/file/6e67691b60ed3e4a55935261314dd534-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 code = {https://github.com/ucbrise/graphtrans},
 editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
 keywords = {peerrev, selected},
 pages = {13266--13279},
 publisher = {Curran Associates, Inc.},
 title = {Representing Long-Range Context for Graph Neural Networks with Global Attention},
 url = {https://proceedings.neurips.cc/paper/2021/file/6e67691b60ed3e4a55935261314dd534-Paper.pdf},
 volume = {34},
 year = {2021}
}

Ujval Misra, Richard Liaw, Lisa Dunlap, Romil Bhardwaj, Kirthevasan Kandasamy, Joseph E. Gonzalez, Ion Stoica, and Alexey Tumanov. "RubberBand: Cloud-Based Hyperparameter Tuning." Proceedings of the Sixteenth European Conference on Computer Systems, 2021.

Hyperparameter tuning is essential to achieving state-of-the-art accuracy in machine learning (ML), but requires substantial compute resources to perform. Existing systems primarily focus on effectively allocating resources for a hyperparameter tuning job under fixed resource constraints. We show that the available parallelism in such jobs changes dynamically over the course of execution and, therefore, presents an opportunity to leverage the elasticity of the cloud. In particular, we address the problem of minimizing the financial cost of executing a hyperparameter tuning job, subject to a time constraint. We present RubberBand---the first framework for cost-efficient, elastic execution of hyperparameter tuning jobs in the cloud. RubberBand utilizes performance instrumentation and cloud pricing to model job completion time and cost prior to runtime, and generate a cost-efficient, elastic resource allocation plan. RubberBand is able to efficiently execute this plan and realize a cost reduction of up to 2x in comparison to static allocation baselines.
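
The planning problem can be illustrated with a toy cost model; the throughput curve, prices, and deadline below are invented, whereas RubberBand fits its models from profiling data. Among worker counts that finish the remaining work before the deadline, the plan picks the cheapest, which under sublinear scaling is the smallest feasible allocation.

"""Toy sketch of choosing a cloud allocation that meets a deadline at minimum cost."""
TOTAL_TRIAL_EPOCHS = 4800           # remaining work across all trials
DEADLINE_HOURS = 4.0
PRICE_PER_WORKER_HOUR = 3.0


def epochs_per_hour(num_workers, base_rate=100.0, scaling_exponent=0.8):
    """Sublinear scaling: doubling workers gives less than double throughput."""
    return base_rate * num_workers ** scaling_exponent


plans = []
for workers in range(1, 65):
    hours = TOTAL_TRIAL_EPOCHS / epochs_per_hour(workers)
    cost = hours * workers * PRICE_PER_WORKER_HOUR
    if hours <= DEADLINE_HOURS:
        plans.append((cost, workers, hours))

cost, workers, hours = min(plans)
print(f"cheapest feasible plan: {workers} workers, {hours:.2f} h, ${cost:.2f}")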

@inproceedings{Misra21,
 abstract = {Hyperparameter tuning is essential to achieving state-of-the-art accuracy in machine learning (ML), but requires substantial compute resources to perform. Existing systems primarily focus on effectively allocating resources for a hyperparameter tuning job under fixed resource constraints. We show that the available parallelism in such jobs changes dynamically over the course of execution and, therefore, presents an opportunity to leverage the elasticity of the cloud.
In particular, we address the problem of minimizing the financial cost of executing a hyperparameter tuning job, subject to a time constraint. We present RubberBand---the first framework for cost-efficient, elastic execution of hyperparameter tuning jobs in the cloud. RubberBand utilizes performance instrumentation and cloud pricing to model job completion time and cost prior to runtime, and generate a cost-efficient, elastic resource allocation plan. RubberBand is able to efficiently execute this plan and realize a cost reduction of up to 2x in comparison to static allocation baselines.},
 address = {New York, NY, USA},
 author = {Misra, Ujval and Liaw, Richard and Dunlap, Lisa and Bhardwaj, Romil and Kandasamy, Kirthevasan and Gonzalez, Joseph E. and Stoica, Ion and Tumanov, Alexey},
 bdsk-url-1 = {https://doi.org/10.1145/3447786.3456245},
 booktitle = {Proceedings of the Sixteenth European Conference on Computer Systems},
 doi = {10.1145/3447786.3456245},
 isbn = {9781450383349},
 keywords = {peerrev, selected, hyperparameter optimization, distributed machine learning},
 location = {Online Event, United Kingdom},
 numpages = {16},
 pages = {327--342},
 publisher = {Association for Computing Machinery},
 series = {EuroSys '21},
 title = {RubberBand: Cloud-Based Hyperparameter Tuning},
 url = {https://doi.org/10.1145/3447786.3456245},
 year = {2021}
}

Yaoqing Yang, Liam Hodgkinson, Ryan Theisen, Joe Zou, Joseph E Gonzalez, Kannan Ramchandran, and Michael W Mahoney. "Taxonomizing local versus global structure in neural network loss landscapes." Advances in Neural Information Processing Systems, 2021.

Viewing neural network models in terms of their loss landscapes has a long history in the statistical mechanics approach to learning, and in recent years it has received attention within machine learning proper. Among other things, local metrics (such as the smoothness of the loss landscape) have been shown to correlate with global properties of the model (such as good generalization performance). Here, we perform a detailed empirical analysis of the loss landscape structure of thousands of neural network models, systematically varying learning tasks, model architectures, and/or quantity/quality of data. By considering a range of metrics that attempt to capture different aspects of the loss landscape, we demonstrate that the best test accuracy is obtained when: the loss landscape is globally well-connected; ensembles of trained models are more similar to each other; and models converge to locally smooth regions. We also show that globally poorly-connected landscapes can arise when models are small or when they are trained to lower quality data; and that, if the loss landscape is globally poorly-connected, then training to zero loss can actually lead to worse test accuracy. Our detailed empirical results shed light on phases of learning (and consequent double descent behavior), fundamental versus incidental determinants of good generalization, the role of load-like and temperature-like parameters in the learning process, different influences on the loss landscape from model and data, and the relationships between local and global metrics, all topics of recent interest.

@inproceedings{NEURIPS2021_9b72e31d,
 abstract = {Viewing neural network models in terms of their loss landscapes has a long history in the statistical mechanics approach to learning, and in recent years it has received attention within machine learning proper. Among other things, local metrics (such as the smoothness of the loss landscape) have been shown to correlate with global properties of the model (such as good generalization performance). Here, we perform a detailed empirical analysis of the loss landscape structure of thousands of neural network models, systematically varying learning tasks, model architectures, and/or quantity/quality of data. By considering a range of metrics that attempt to capture different aspects of the loss landscape, we demonstrate that the best test accuracy is obtained when: the loss landscape is globally well-connected; ensembles of trained models are more similar to each other; and models converge to locally smooth regions. We also show that globally poorly-connected landscapes can arise when models are small or when they are trained to lower quality data; and that, if the loss landscape is globally poorly-connected, then training to zero loss can actually lead to worse test accuracy. Our detailed empirical results shed light on phases of learning (and consequent double descent behavior), fundamental versus incidental determinants of good generalization, the role of load-like and temperature-like parameters in the learning process, different influences on the loss landscape from model and data, and the relationships between local and global metrics, all topics of recent interest.},
 author = {Yang, Yaoqing and Hodgkinson, Liam and Theisen, Ryan and Zou, Joe and Gonzalez, Joseph E and Ramchandran, Kannan and Mahoney, Michael W},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2021/file/9b72e31dac81715466cd580a448cf823-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {M. Ranzato and A. Beygelzimer and Y. Dauphin and P.S. Liang and J. Wortman Vaughan},
 keywords = {peerrev, selected},
 pages = {18722--18733},
 publisher = {Curran Associates, Inc.},
 title = {Taxonomizing local versus global structure in neural network loss landscapes},
 url = {https://proceedings.neurips.cc/paper/2021/file/9b72e31dac81715466cd580a448cf823-Paper.pdf},
 volume = {34},
 year = {2021}
}

Lianmin Zheng, Ruochen Liu, Junru Shao, Tianqi Chen, Joseph E. Gonzalez, Ion Stoica, and Ameer Haj Ali. "TenSet: A Large-scale Program Performance Dataset for Learned Tensor Compilers." Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1), 2021.

Search-based tensor compilers can greatly accelerate the execution of machine learning models by generating high-performance tensor programs, such as matrix multiplications and convolutions. These compilers take a high-level mathematical expression as input and search for the fastest low-level implementations. At the core of the search procedure is a cost model which estimates the performance of different candidates to reduce the frequency of time-consuming on-device measurements. There has been a growing interest in using machine learning techniques to learn a cost model to ease the effort of building an analytical model. However, a standard dataset for pre-training and benchmarking learned cost models is lacking. We introduce TenSet, a large-scale tensor program performance dataset. TenSet contains 52 million program performance records collected from 6 hardware platforms. We provide comprehensive studies on how to learn and evaluate the cost models, including data collection, model architectures, loss functions, transfer learning, and evaluation metrics. We also show that a cost model pre-trained on TenSet can accelerate the search time in the state-of-the-art tensor compiler by up to 10x. The dataset is available at \url{https://github.com/tlc-pack/tenset}.
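
The intended workflow for such a dataset looks roughly like the sketch below, where synthetic features and a ridge-regression cost model stand in for TenSet's real program features and learned models: fit latency from program features on measured records, then rank unmeasured candidate programs by predicted cost so only the most promising few need on-device measurement.

"""Sketch of a learned cost model ranking candidate programs (synthetic data)."""
import numpy as np

rng = np.random.default_rng(0)

# Synthetic "program features" (e.g., tile sizes, vectorization flags) and latencies.
n_train, n_feat = 2000, 12
X = rng.normal(size=(n_train, n_feat))
true_w = rng.normal(size=n_feat)
y = X @ true_w + 0.1 * rng.normal(size=n_train)          # measured latency (ms)

# Ridge-regression cost model: w = (X^T X + lam I)^{-1} X^T y
lam = 1.0
w = np.linalg.solve(X.T @ X + lam * np.eye(n_feat), X.T @ y)

# Rank unmeasured candidate programs by predicted latency; measure only the top few.
candidates = rng.normal(size=(500, n_feat))
pred = candidates @ w
top_k = np.argsort(pred)[:5]
actual = candidates @ true_w
print("predicted-best candidates:", top_k)
print("their true latencies:", np.round(actual[top_k], 3),
      "vs. overall true minimum:", round(actual.min(), 3))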

@inproceedings{zheng2021tenset,
 abstract = {Search-based tensor compilers can greatly accelerate the execution of machine learning models by generating high-performance tensor programs, such as matrix multiplications and convolutions. These compilers take a high-level mathematical expression as input and search for the fastest low-level implementations. At the core of the search procedure is a cost model which estimates the performance of different candidates to reduce the frequency of time-consuming on-device measurements. There has been a growing interest in using machine learning techniques to learn a cost model to ease the effort of building an analytical model. However, a standard dataset for pre-training and benchmarking learned cost models is lacking.

We introduce TenSet, a large-scale tensor program performance dataset. TenSet contains 52 million program performance records collected from 6 hardware platforms. We provide comprehensive studies on how to learn and evaluate the cost models, including data collection, model architectures, loss functions, transfer learning, and evaluation metrics. We also show that a cost model pre-trained on TenSet can accelerate the search time in the state-of-the-art tensor compiler by up to 10x. The dataset is available at \url{https://github.com/tlc-pack/tenset}.},
 author = {Lianmin Zheng and Ruochen Liu and Junru Shao and Tianqi Chen and Joseph E. Gonzalez and Ion Stoica and Ameer Haj Ali},
 bdsk-url-1 = {https://openreview.net/forum?id=aIfp8kLuvc9},
 booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)},
 code = {https://github.com/tlc-pack/tenset},
 keywords = {peerrev, selected},
 title = {TenSet: A Large-scale Program Performance Dataset for Learned Tensor Compilers},
 url = {https://openreview.net/forum?id=aIfp8kLuvc9},
 year = {2021}
}

Alan Pham, Eunice Chan, Vikranth Srivatsa, Dhruba Ghosh, Yaoqing Yang, Yaodong Yu, Ruiqi Zhong, Joseph E. Gonzalez, and Jacob Steinhardt. "The Effect of Model Size on Worst-Group Generalization." arXiv, 2021.

Overparameterization is shown to result in poor test accuracy on rare subgroups under a variety of settings where subgroup information is known. To gain a more complete picture, we consider the case where subgroup information is unknown. We investigate the effect of model size on worst-group generalization under empirical risk minimization (ERM) across a wide range of settings, varying: 1) architectures (ResNet, VGG, or BERT), 2) domains (vision or natural language processing), 3) model size (width or depth), and 4) initialization (with pre-trained or random weights). Our systematic evaluation reveals that increasing model size does not hurt, and may help, worst-group test performance under ERM across all setups. In particular, increasing pre-trained model size consistently improves performance on Waterbirds and MultiNLI. We advise practitioners to use larger pre-trained models when subgroup labels are unknown.
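
For reference, the evaluation metric at the center of this study is straightforward to compute once subgroup labels are available at test time; the sketch below uses a synthetic test set and a hypothetical classifier that leans on a spurious attribute.

"""Worst-group accuracy: the minimum accuracy over (label, spurious-attribute) subgroups."""
import numpy as np

rng = np.random.default_rng(0)
n = 1000
labels = rng.integers(0, 2, size=n)
attrs = rng.integers(0, 2, size=n)          # spurious attribute, e.g. background type
# Hypothetical model that relies on the spurious attribute: correct 95% of the
# time when label == attr, only 60% of the time otherwise.
agree = labels == attrs
correct = np.where(agree, rng.random(n) < 0.95, rng.random(n) < 0.60)

groups = {}
for y in (0, 1):
    for a in (0, 1):
        mask = (labels == y) & (attrs == a)
        groups[(y, a)] = correct[mask].mean()

print("average accuracy:", round(correct.mean(), 3))
print("per-group accuracy:", {g: round(v, 3) for g, v in groups.items()})
print("worst-group accuracy:", round(min(groups.values()), 3))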

@misc{Pham21,
 abstract = {Overparameterization is shown to result in poor test accuracy on rare subgroups under a variety of settings where subgroup information is known. To gain a more complete picture, we consider the case where subgroup information is unknown. We investigate the effect of model size on worst-group generalization under empirical risk minimization (ERM) across a wide range of settings, varying: 1) architectures (ResNet, VGG, or BERT), 2) domains (vision or natural language processing), 3) model size (width or depth), and 4) initialization (with pre-trained or random weights). Our systematic evaluation reveals that increasing model size does not hurt, and may help, worst-group test performance under ERM across all setups. In particular, increasing pre-trained model size consistently improves performance on Waterbirds and MultiNLI. We advise practitioners to use larger pre-trained models when subgroup labels are unknown.},
 author = {Pham, Alan and Chan, Eunice and Srivatsa, Vikranth and Ghosh, Dhruba and Yang, Yaoqing and Yu, Yaodong and Zhong, Ruiqi and Gonzalez, Joseph E. and Steinhardt, Jacob},
 bdsk-url-1 = {https://arxiv.org/abs/2112.04094},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2112.04094},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2112.04094},
 keywords = {arxivpre, Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {The Effect of Model Size on Worst-Group Generalization},
 url = {https://arxiv.org/abs/2112.04094},
 year = {2021}
}

Nathan Pemberton, Johann Schleier-Smith, and Joseph E. Gonzalez. "The RESTless Cloud." Proceedings of the Workshop on Hot Topics in Operating Systems, 2021.

Cloud provider APIs have emerged as the de facto operating system interface for the warehouse scale computers that comprise the public cloud. Like single-server operating systems, they provide the resource allocation, protection, communication paths, naming, and scheduling for these large machines. Cloud provider APIs also provide all sorts of things that operating systems do not, things like big data analytics, machine learning model training, or factory automation. Somewhere, lurking within this menagerie of services, there is an operating system interface to a really big computer, the computer that today's application developers target. This computer works nothing like a single server, yet it also isn't a dispersed distributed system like the internet. It is something in-between. Now is the time to distill and refine a coherent "cloud system interface" from the multitude of cloud provider APIs, preferably a portable one. In this paper we discuss what goes in, what stays out, and the principles that inform these decisions.

@inproceedings{Pemberton21,
 abstract = {Cloud provider APIs have emerged as the de facto operating system interface for the warehouse scale computers that comprise the public cloud. Like single-server operating systems, they provide the resource allocation, protection, communication paths, naming, and scheduling for these large machines. Cloud provider APIs also provide all sorts of things that operating systems do not, things like big data analytics, machine learning model training, or factory automation. Somewhere, lurking within this menagerie of services, there is an operating system interface to a really big computer, the computer that today's application developers target. This computer works nothing like a single server, yet it also isn't a dispersed distributed system like the internet. It is something in-between. Now is the time to distill and refine a coherent "cloud system interface" from the multitude of cloud provider APIs, preferably a portable one. In this paper we discuss what goes in, what stays out, and the principles that inform these decisions.},
 address = {New York, NY, USA},
 author = {Pemberton, Nathan and Schleier-Smith, Johann and Gonzalez, Joseph E.},
 bdsk-url-1 = {https://doi.org/10.1145/3458336.3465280},
 booktitle = {Proceedings of the Workshop on Hot Topics in Operating Systems},
 doi = {10.1145/3458336.3465280},
 isbn = {9781450384384},
 keywords = {peerrev, selected},
 location = {Ann Arbor, Michigan},
 numpages = {9},
 pages = {49--57},
 publisher = {Association for Computing Machinery},
 series = {HotOS '21},
 title = {The RESTless Cloud},
 url = {https://doi.org/10.1145/3458336.3465280},
 year = {2021}
}

Matthew A. Wright and Joseph E. Gonzalez. "Transformers are Deep Infinite-Dimensional Non-Mercer Binary Kernel Machines." arXiv, 2021.

Despite their ubiquity in core AI fields like natural language processing, the mechanics of deep attention-based neural networks like the Transformer model are not fully understood. In this article, we present a new perspective towards understanding how Transformers work. In particular, we show that the "dot-product attention" that is the core of the Transformer's operation can be characterized as a kernel learning method on a pair of Banach spaces. Specifically, the Transformer's kernel is characterized as having an infinite feature dimension. Along the way we consider an extension of the standard kernel learning problem to a binary setting, where data come from two input domains and a response is defined for every cross-domain pair. We prove a new representer theorem for these binary kernel machines with non-Mercer (indefinite, asymmetric) kernels (implying that the functions learned are elements of reproducing kernel Banach spaces rather than Hilbert spaces), and also prove a new universal approximation theorem showing that the Transformer calculation can learn any binary non-Mercer reproducing kernel Banach space pair. We experiment with new kernels in Transformers, and obtain results that suggest the infinite dimensionality of the standard Transformer kernel is partially responsible for its performance. This paper's results provide a new theoretical understanding of a very important but poorly understood model in modern machine learning.
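
The kernel view of dot-product attention admits a short worked example. The snippet only checks the algebraic identity for a single query with random data and does not touch the paper's reproducing kernel Banach space machinery: attention is a kernel-weighted average of values with the asymmetric kernel k(q, key) = exp(q · key / sqrt(d)), which matches the usual softmax form.

"""Worked check: attention over one query equals a kernel-weighted average of values."""
import numpy as np

rng = np.random.default_rng(0)
d, n = 8, 5
q = rng.normal(size=d)                 # one query
K = rng.normal(size=(n, d))            # keys
V = rng.normal(size=(n, 3))            # values


def kernel(q, k):
    return np.exp(q @ k / np.sqrt(d))  # exponentiated scaled dot product


# Kernel-smoother form: weighted average of values with kernel weights.
weights = np.array([kernel(q, K[j]) for j in range(n)])
attn_kernel_form = (weights[:, None] * V).sum(axis=0) / weights.sum()

# Standard softmax attention for the same query.
scores = K @ q / np.sqrt(d)
softmax = np.exp(scores - scores.max()) / np.exp(scores - scores.max()).sum()
attn_softmax_form = softmax @ V

print("kernel form :", np.round(attn_kernel_form, 6))
print("softmax form:", np.round(attn_softmax_form, 6))
print("max abs difference:", np.abs(attn_kernel_form - attn_softmax_form).max())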

@misc{Wright21,
 abstract = {Despite their ubiquity in core AI fields like natural language processing, the mechanics of deep attention-based neural networks like the Transformer model are not fully understood. In this article, we present a new perspective towards understanding how Transformers work. In particular, we show that the "dot-product attention" that is the core of the Transformer's operation can be characterized as a kernel learning method on a pair of Banach spaces. In particular, the Transformer's kernel is characterized as having an infinite feature dimension. Along the way we consider an extension of the standard kernel learning problem to a binary setting, where data come from two input domains and a response is defined for every cross-domain pair. We prove a new representer theorem for these binary kernel machines with non-Mercer (indefinite, asymmetric) kernels (implying that the functions learned are elements of reproducing kernel Banach spaces rather than Hilbert spaces), and also prove a new universal approximation theorem showing that the Transformer calculation can learn any binary non-Mercer reproducing kernel Banach space pair. We experiment with new kernels in Transformers, and obtain results that suggest the infinite dimensionality of the standard Transformer kernel is partially responsible for its performance. This paper's results provide a new theoretical understanding of a very important but poorly understood model in modern machine~learning.},
 author = {Wright, Matthew A. and Gonzalez, Joseph E.},
 bdsk-url-1 = {https://arxiv.org/abs/2106.01506},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2106.01506},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2106.01506},
 keywords = {arxivpre, Machine Learning (cs.LG), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {Transformers are Deep Infinite-Dimensional Non-Mercer Binary Kernel Machines},
 url = {https://arxiv.org/abs/2106.01506},
 year = {2021}
}

Priya Sundaresan, Jennifer Grannen, Brijen Thananjeyan, Ashwin Balakrishna, Jeffrey Ichnowski, Ellen Novoseller, Minho Hwang, Michael Laskey, Joseph E. Gonzalez, and Ken Goldberg. "Untangling Dense Non-Planar Knots by Learning Manipulation Features and Recovery Policies." arXiv, 2021.

Robot manipulation for untangling 1D deformable structures such as ropes, cables, and wires is challenging due to their infinite dimensional configuration space, complex dynamics, and tendency to self-occlude. Analytical controllers often fail in the presence of dense configurations, due to the difficulty of grasping between adjacent cable segments. We present two algorithms that enhance robust cable untangling, LOKI and SPiDERMan, which operate alongside HULK, a high-level planner from prior work. LOKI uses a learned model of manipulation features to refine a coarse grasp keypoint prediction to a precise, optimized location and orientation, while SPiDERMan uses a learned model to sense task progress and apply recovery actions. We evaluate these algorithms in physical cable untangling experiments with 336 knots and over 1500 actions on real cables using the da Vinci surgical robot. We find that the combination of HULK, LOKI, and SPiDERMan is able to untangle dense overhand, figure-eight, double-overhand, square, bowline, granny, stevedore, and triple-overhand knots. The composition of these methods successfully untangles a cable from a dense initial configuration in $68.3\%$ of 60 physical experiments and achieves $50\%$ higher success rates than baselines from prior work. Supplementary material, code, and videos can be found at \url{https://sites.google.com/berkeley.edu/non-planar-untangling}.

@misc{Sundaresan21,
 abstract = {Robot manipulation for untangling 1D deformable structures such as ropes, cables, and wires is challenging due to their infinite dimensional configuration space, complex dynamics, and tendency to self-occlude. Analytical controllers often fail in the presence of dense configurations, due to the difficulty of grasping between adjacent cable segments. We present two algorithms that enhance robust cable untangling, LOKI and SPiDERMan, which operate alongside HULK, a high-level planner from prior work. LOKI uses a learned model of manipulation features to refine a coarse grasp keypoint prediction to a precise, optimized location and orientation, while SPiDERMan uses a learned model to sense task progress and apply recovery actions. We evaluate these algorithms in physical cable untangling experiments with 336 knots and over 1500 actions on real cables using the da Vinci surgical robot. We find that the combination of HULK, LOKI, and SPiDERMan is able to untangle dense overhand, figure-eight, double-overhand, square, bowline, granny, stevedore, and triple-overhand knots. The composition of these methods successfully untangles a cable from a dense initial configuration in $68.3\%$ of 60 physical experiments and achieves $50\%$ higher success rates than baselines from prior work. Supplementary material, code, and videos can be found at \url{https://sites.google.com/berkeley.edu/non-planar-untangling}.},
 author = {Sundaresan, Priya and Grannen, Jennifer and Thananjeyan, Brijen and Balakrishna, Ashwin and Ichnowski, Jeffrey and Novoseller, Ellen and Hwang, Minho and Laskey, Michael and Gonzalez, Joseph E. and Goldberg, Ken},
 bdsk-url-1 = {https://arxiv.org/abs/2107.08942},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2107.08942},
 code = {https://sites.google.com/berkeley.edu/non-planar-untangling},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2107.08942},
 keywords = {arxivpre, Robotics (cs.RO), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {Untangling Dense Non-Planar Knots by Learning Manipulation Features and Recovery Policies},
 url = {https://arxiv.org/abs/2107.08942},
 year = {2021}
}

Guanhua Wang, Zhuang Liu, Brandon Hsieh, Siyuan Zhuang, Joseph Gonzalez, Trevor Darrell, and Ion Stoica. "sensAI: ConvNets Decomposition via Class Parallelism for Fast Inference on Live Data." Proceedings of Machine Learning and Systems, 2021.

Convolutional Neural Networks (ConvNets) enable computers to excel on vision learning tasks such as image classification and object detection. Recently, real-time inference on live data has become increasingly important. From a systems perspective, it requires fast inference on each single, incoming data item (e.g., 1 image). The two mainstream distributed model serving paradigms -- data parallelism and model parallelism -- are not necessarily desirable here, because a single input data item cannot be further split via data parallelism, and model parallelism introduces huge communication overhead. To achieve live data inference with low latency, we propose sensAI, a novel and generic approach that decouples a CNN model into disconnected subnets, each of which is responsible for predicting certain class(es). We call this new model distribution paradigm class parallelism. Experimental results show that sensAI achieves up to 18x faster inference on a single input data item with no or negligible accuracy loss on the CIFAR-10, CIFAR-100, and ImageNet-1K datasets.
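
The paper obtains its per-class subnets by pruning trained ConvNets; the toy sketch below only illustrates the class-parallel inference pattern itself, using one-vs-all linear scorers as stand-ins for the subnets and combining their independent confidences with an argmax. All shapes and names are invented.

import numpy as np

rng = np.random.default_rng(0)
num_classes, d = 10, 64
x = rng.normal(size=d)                        # one incoming data item

# Stand-ins for per-class subnets: each independently scores "is this class c?".
# In sensAI these are pruned binary ConvNets that can run on separate devices.
subnet_weights = [rng.normal(size=d) for _ in range(num_classes)]
subnet_biases = [rng.normal() for _ in range(num_classes)]

def run_subnet(c, x):
    # Binary confidence for class c (sigmoid of a linear score in this toy version).
    z = subnet_weights[c] @ x + subnet_biases[c]
    return 1.0 / (1.0 + np.exp(-z))

# Class parallelism: the per-class confidences are independent, so they can be
# computed concurrently; the final prediction is the most confident class.
confidences = [run_subnet(c, x) for c in range(num_classes)]
prediction = int(np.argmax(confidences))
print(prediction)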

@inproceedings{Guanhua21,
 abstract = {Convolutional Neural Networks (ConvNets) enable computers to excel on vision learning tasks such as image classification, object detection. Recently, real-time inference on live data is becoming more and more important. From a system perspective, it requires fast inference on each single, incoming data item (e.g. 1 image). Two main-stream distributed model serving paradigms -- data parallelism and model parallelism -- are not necessarily desirable here, because we cannot further split a single input data piece via data parallelism, and model parallelism introduces huge communication overhead. To achieve live data inference with low latency, we propose sensAI, a novel and generic approach that decouples a CNN model into disconnected subnets, each is responsible for predicting certain class(es). We call this new model distribution paradigm as class parallelism. Experimental results show that, sensAI achieves up to 18x faster inference on single input data item with no or negligible accuracy loss on CIFAR-10, CIFAR-100 and ImageNet-1K datasets.},
 author = {Wang, Guanhua and Liu, Zhuang and Hsieh, Brandon and Zhuang, Siyuan and Gonzalez, Joseph and Darrell, Trevor and Stoica, Ion},
 bdsk-url-1 = {https://proceedings.mlsys.org/paper/2021/file/c4ca4238a0b923820dcc509a6f75849b-Paper.pdf},
 booktitle = {Proceedings of Machine Learning and Systems},
 editor = {A. Smola and A. Dimakis and I. Stoica},
 keywords = {peerrev, selected},
 pages = {664--679},
 title = {sensAI: ConvNets Decomposition via Class Parallelism for Fast Inference on Live Data},
 url = {https://proceedings.mlsys.org/paper/2021/file/c4ca4238a0b923820dcc509a6f75849b-Paper.pdf},
 volume = {3},
 year = {2021}
}

Vidit Saxena, Joakim Jalden, and Joseph E. Gonzalez. "Thompson Sampling for Linearly Constrained Bandits." Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics (AIStats), 2020.

We address multi-armed bandits (MAB) where the objective is to maximize the cumulative reward under a probabilistic linear constraint. For a few real-world instances of this problem, constrained extensions of the well-known Thompson Sampling (TS) heuristic have recently been proposed. However, finite-time analysis of constrained TS is challenging; as a result, only $O(\sqrt{T})$ bounds on the cumulative reward loss (i.e., the regret) are available. In this paper, we describe LinConTS, a TS-based algorithm for bandits that place a linear constraint on the probability of earning a reward in every round. We show that for LinConTS, the regret as well as the cumulative constraint violations are upper bounded by $O(\log T)$. We develop a proof technique that relies on careful analysis of the dual problem and combine it with recent theoretical work on unconstrained TS. Through numerical experiments on two real-world datasets, we demonstrate that LinConTS outperforms an asymptotically optimal upper confidence bound (UCB) scheme in terms of simultaneously minimizing the regret and the violation.
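
LinConTS itself solves a small linear program over the arms every round; the sketch below is a deliberately simplified constrained Thompson Sampling heuristic, not the paper's algorithm: it samples Beta posteriors for both the reward and the constraint probability of each arm and plays the highest-reward arm among those whose sampled constraint probability clears the threshold. All arm statistics and parameters are made up.

import numpy as np

rng = np.random.default_rng(0)
K, T, threshold = 3, 5000, 0.5          # arms, rounds, constraint level
p_reward = np.array([0.4, 0.6, 0.8])    # true Bernoulli reward rates
p_constr = np.array([0.9, 0.7, 0.3])    # true constraint-satisfaction rates

# Beta(1, 1) priors per arm, for both the reward and the constraint probability.
r_a, r_b = np.ones(K), np.ones(K)
c_a, c_b = np.ones(K), np.ones(K)

for t in range(T):
    theta_r = rng.beta(r_a, r_b)        # posterior samples, reward
    theta_c = rng.beta(c_a, c_b)        # posterior samples, constraint
    feasible = np.where(theta_c >= threshold)[0]
    # Play the best feasible arm under the sampled model; fall back to the arm
    # with the highest sampled constraint probability if none looks feasible.
    arm = feasible[np.argmax(theta_r[feasible])] if len(feasible) else np.argmax(theta_c)
    reward = rng.random() < p_reward[arm]
    satisfied = rng.random() < p_constr[arm]
    r_a[arm] += reward;    r_b[arm] += 1 - reward
    c_a[arm] += satisfied; c_b[arm] += 1 - satisfied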

@inproceedings{pmlr-v108-saxena20a,
 abstract = {We address multi-armed bandits (MAB) where the objective is to maximize the cumulative reward under a probabilistic linear constraint. For a few real-world instances of this problem, constrained extensions of the well-known Thompson Sampling (TS) heuristic have recently been proposed. However, finite-time analysis of constrained TS is challenging; as a result, only O( sqrt( T ) ) bounds on the cumulative reward loss (i.e., the regret) are available. In this paper, we describe LinConTS, a TS-based algorithm for bandits that place a linear constraint on the probability of earning a reward in every round. We show that for LinConTS, the regret as well as the cumulative constraint violations are upper bounded by O( log ( T ) ). We develop a proof technique that relies on careful analysis of the dual problem and combine it with recent theoretical work on unconstrained TS. Through numerical experiments on two real-world datasets, we demonstrate that LinConTS outperforms an asymptotically optimal upper confidence bound (UCB) scheme in terms of simultaneously minimizing the regret and the violation.},
 address = {Online},
 author = {Vidit Saxena and Joakim Jalden and Joseph E. Gonzalez},
 bdsk-url-1 = {http://proceedings.mlr.press/v108/saxena20a.html},
 booktitle = {Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics (AIStats)},
 date-modified = {2020-08-02 11:27:35 -0700},
 editor = {Chiappa, Silvia and Calandra, Roberto},
 keywords = {peerrev},
 month = {8},
 pages = {1999--2009},
 pdf = {http://proceedings.mlr.press/v108/saxena20a/saxena20a.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = {Thompson Sampling for Linearly Constrained Bandits},
 url = {http://proceedings.mlr.press/v108/saxena20a.html},
 volume = {108},
 year = {2020}
}

Daniel Rothchild, Ashwinee Panda, Enayat Ullah, Nikita Ivkin, Ion Stoica, Vladimir Braverman, Joseph E. Gonzalez, and Raman Arora. "FetchSGD: Communication-Efficient Federated Learning with Sketching." Proceedings of the International Conference on Machine Learning (ICML), 2020.

Existing approaches to federated learning suffer from a communication bottleneck as well as convergence issues due to sparse client participation. In this paper we introduce a novel algorithm, called FetchSGD, to overcome these challenges. FetchSGD compresses model updates using a Count Sketch, and then takes advantage of the mergeability of sketches to combine model updates from many workers. A key insight in the design of FetchSGD is that, because the Count Sketch is linear, momentum and error accumulation can both be carried out within the sketch. This allows the algorithm to move momentum and error accumulation from clients to the central aggregator, overcoming the challenges of sparse client participation while still achieving high compression rates and good convergence. We prove that FetchSGD has favorable convergence guarantees, and we demonstrate its empirical effectiveness by training two residual networks and a transformer model.
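
The property FetchSGD relies on is the linearity of the Count Sketch, which lets momentum and error accumulation be carried out on sketches rather than on full gradients. The numpy sketch below implements a basic Count Sketch and verifies that linearity; the top-k extraction, momentum schedule, and federated aggregation are omitted, and all sizes are illustrative.

import numpy as np

class CountSketch:
    # Minimal Count Sketch of a length-d vector into an r x w table.
    def __init__(self, d, rows=5, width=256, seed=0):
        rng = np.random.default_rng(seed)
        self.d, self.rows, self.width = d, rows, width
        self.idx = rng.integers(0, width, size=(rows, d))   # bucket hashes
        self.sign = rng.choice([-1, 1], size=(rows, d))     # sign hashes

    def sketch(self, x):
        table = np.zeros((self.rows, self.width))
        for r in range(self.rows):
            np.add.at(table[r], self.idx[r], self.sign[r] * x)
        return table

    def estimate(self, table):
        # Median-of-rows heavy-hitter estimate for every coordinate.
        est = np.stack([self.sign[r] * table[r, self.idx[r]] for r in range(self.rows)])
        return np.median(est, axis=0)

d = 10_000
cs = CountSketch(d)
g1, g2 = np.random.default_rng(1).normal(size=(2, d))

# Linearity: sketch(g1) + sketch(g2) == sketch(g1 + g2), so workers can send
# sketched gradients and the server can sum them (and keep its momentum and
# error accumulators) entirely in sketch space.
assert np.allclose(cs.sketch(g1) + cs.sketch(g2), cs.sketch(g1 + g2))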

@inproceedings{FetchSGDICML20,
 abstract = {Existing approaches to federated learning suffer from a communication bottleneck as well as convergence issues due to sparse client participation. In this paper we introduce a novel algorithm, called FetchSGD, to overcome these challenges. FetchSGD compresses model updates using a Count Sketch, and then takes advantage of the mergeability of sketches to combine model updates from many workers. A key insight in the design of FetchSGD is that, because the Count Sketch is linear, momentum and error accumulation can both be carried out within the sketch. This allows the algorithm to move momentum and error accumulation from clients to the central aggregator, overcoming the challenges of sparse client participation while still achieving high compression rates and good convergence. We prove that FetchSGD has favorable convergence guarantees, and we demonstrate its empirical effectiveness by training two residual networks and a transformer model.},
 author = {Daniel Rothchild and Ashwinee Panda and Enayat Ullah and Nikita Ivkin and Ion Stoica and Vladimir Braverman and Joseph E. Gonzalez and Raman Arora},
 booktitle = {Proceedings of the International Conference on Machine Learning (ICML)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {7},
 series = {ICML'20},
 title = { {FetchSGD}: Communication-Efficient Federated Learning with Sketching},
 url = {https://proceedings.icml.cc/static/paper\%5Ffiles/icml/2020/5927-Paper.pdf},
 year = {2020}
}

Xin Wang, Thomas E. Huang, Trevor Darrell, Joseph E. Gonzalez, and Fisher Yu. "Frustratingly Simple Few-Shot Object Detection." Proceedings of the International Conference on Machine Learning (ICML), 2020.

Detecting rare objects from a few examples is an emerging problem. Prior works show meta-learning is a promising approach, but fine-tuning techniques have drawn scant attention. We find that fine-tuning only the last layer of existing detectors on rare classes is crucial to the few-shot object detection task. Such a simple approach outperforms the meta-learning methods by roughly 2 to 20 points on current benchmarks and sometimes even doubles the accuracy of the prior methods. However, the high variance in the few samples often leads to the unreliability of existing benchmarks. We revise the evaluation protocols by sampling multiple groups of training examples to obtain stable comparisons and build new benchmarks based on three datasets: PASCAL VOC, COCO, and LVIS. Again, our fine-tuning approach establishes a new state of the art on the revised benchmarks. The code as well as the pretrained models are available at https://github.com/ucbdrive/few-shot-object-detection.
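
A minimal PyTorch rendering of the freeze-then-fine-tune recipe, using a generic backbone and linear head as stand-ins for a detector's feature extractor and box classifier; the detector-specific pieces are omitted and all shapes are invented.

import torch
import torch.nn as nn

# Stand-in for a pre-trained feature extractor and a new last-layer head.
backbone = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 64))
head = nn.Linear(64, 5)              # 5 novel (rare) classes

# Freeze everything except the last layer.
for p in backbone.parameters():
    p.requires_grad = False

opt = torch.optim.SGD(head.parameters(), lr=1e-2, momentum=0.9)
loss_fn = nn.CrossEntropyLoss()

# A tiny "few-shot" batch: a handful of examples per novel class.
x = torch.randn(25, 128)
y = torch.randint(0, 5, (25,))

for step in range(100):
    with torch.no_grad():            # frozen backbone: no gradients needed here
        feats = backbone(x)
    loss = loss_fn(head(feats), y)
    opt.zero_grad()
    loss.backward()
    opt.step()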

@inproceedings{FSFewShotICML20,
 abstract = {Detecting rare objects from a few examples is an emerging problem. Prior works show meta-learning is a promising approach. But, fine-tuning techniques have drawn scant attention. We find that fine-tuning only the last layer of existing detectors on rare classes is crucial to the few-shot object detection task. Such a simple approach outperforms the meta-learning methods by roughly 2\~20 points on current benchmarks and sometimes even doubles the accuracy of the prior methods. However, the high variance in the few samples often leads to the unreliability of existing benchmarks. We revise the evaluation protocols by sampling multiple groups of training examples to obtain stable comparisons and build new benchmarks based on three datasets: PASCAL VOC, COCO and LVIS. Again, our fine-tuning approach establishes a new state of the art on the revised benchmarks. The code as well as the pretrained models are available at this URL.},
 author = {Xin Wang and Thomas E. Huang and Trevor Darrell and Joseph E. Gonzalez and Fisher Yu},
 booktitle = {Proceedings of the International Conference on Machine Learning (ICML)},
 code = {https://github.com/ucbdrive/few-shot-object-detection},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {7},
 series = {ICML'20},
 title = {Frustratingly Simple Few-Shot Object Detection},
 url = {https://proceedings.icml.cc/static/paper\%5Ffiles/icml/2020/2957-Paper.pdf},
 year = {2020}
}

Xiayue Charles Lin, Joseph E. Gonzalez, and Joseph M. Hellerstein. "Serverless Boom or Bust? An Analysis of Economic Incentives." 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud), 2020.

Serverless computing is a new paradigm that promises to free cloud users from the burden of having to provision and manage resources. However, the degree to which serverless computing will replace provisioned servers remains an open question. To address this, we develop an economic model that aims to quantify the value of serverless to providers and customers. A simple model of incentives for rational providers and customers allows us to see, in broad strokes, when and why serverless technologies are worth pursuing. By characterizing the conditions under which mutually beneficial economic incentives exist, our model suggests that many classes of customers can already benefit from switching to a serverless model and taking advantage of autoscaling at today's price points. Our model also helps characterize technical research directions that would be likely to have impact in the market.
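
The numbers below are not from the paper's model; they are a back-of-the-envelope comparison in the same spirit, with hypothetical prices: serverless bills only for busy time at a per-unit premium, so it is cheaper whenever average utilization stays below the reciprocal of that premium.

# Illustrative cost comparison (all prices hypothetical).
vm_price_per_hour = 0.10          # always-on provisioned server
serverless_price_per_hour = 0.40  # price for one hour of busy serverless compute
hours = 24 * 30                   # one month
utilization = 0.15                # fraction of time the workload is actually busy

provisioned_cost = vm_price_per_hour * hours
serverless_cost = serverless_price_per_hour * utilization * hours

# Break-even utilization: below this, serverless is the cheaper option.
break_even = vm_price_per_hour / serverless_price_per_hour
print(f"provisioned ${provisioned_cost:.2f}, serverless ${serverless_cost:.2f}, "
      f"break-even utilization {break_even:.0%}")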

@inproceedings{Lin20,
 abstract = {
Serverless computing is a new paradigm that promises to free cloud users from the burden of having to provision and manage resources. However, the degree to which serverless computing will replace provisioned servers remains an open question.

To address this, we develop an economic model that aims to quantify the value of serverless to providers and customers. A simple model of incentives for rational providers and customers allows us to see, in broad strokes, when and why serverless technologies are worth pursuing. By characterizing the conditions under which mutually beneficial economic incentives exist, our model suggests that many classes of customers can already benefit from switching to a serverless model and taking advantage of autoscaling at today's price points. Our model also helps characterize technical research directions that would be likely to have impact in the market.
},
 author = {Xiayue Charles Lin and Joseph E. Gonzalez and Joseph M. Hellerstein},
 bdsk-url-1 = {https://www.usenix.org/conference/hotcloud20/presentation/lin},
 booktitle = {12th {USENIX} Workshop on Hot Topics in Cloud Computing ({HotCloud})},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {7},
 publisher = { {USENIX} Association},
 title = {Serverless Boom or Bust? An Analysis of Economic Incentives},
 url = {https://www.usenix.org/conference/hotcloud20/presentation/lin},
 year = {2020}
}

Zhuohan Li, Eric Wallace, Sheng Shen, Kevin Lin, Kurt Keutzer, Dan Klein, and Joseph E. Gonzalez. "Train Big, Then Compress: Rethinking Model Size for Efficient Training and Inference of Transformers." Proceedings of the International Conference on Machine Learning (ICML), 2020.

Since hardware resources are limited, the objective of training deep learning models is typically to maximize accuracy subject to the time and memory constraints of training and inference. We study the impact of model size in this setting, focusing on Transformer models for NLP tasks that are limited by compute: self-supervised pretraining and high-resource machine translation. We first show that even though smaller Transformer models execute faster per iteration, wider and deeper models converge in significantly fewer steps. Moreover, this acceleration in convergence typically outpaces the additional computational overhead of using larger models. Therefore, the most compute-efficient training strategy is to counterintuitively train extremely large models but stop after a small number of iterations. This leads to an apparent trade-off between the training efficiency of large Transformer models and the inference efficiency of small Transformer models. However, we show that large models are more robust to compression techniques such as quantization and pruning than small models. Consequently, one can get the best of both worlds: heavily compressed, large models achieve higher accuracy than lightly compressed, small models.
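
A small numpy sketch of the "then compress" step under stated assumptions: magnitude pruning followed by symmetric uniform post-training quantization of a single weight matrix. The paper's experiments apply these ideas to trained Transformers; the matrix, sparsity, and bit-width below are arbitrary.

import numpy as np

def magnitude_prune(w, sparsity=0.6):
    # Zero out the smallest-magnitude fraction of the weights.
    k = int(sparsity * w.size)
    thresh = np.partition(np.abs(w).ravel(), k)[k]
    return np.where(np.abs(w) >= thresh, w, 0.0)

def uniform_quantize(w, bits=8):
    # Symmetric uniform post-training quantization, returned in dequantized form.
    max_level = 2 ** (bits - 1) - 1
    scale = np.abs(w).max() / max_level
    q = np.clip(np.round(w / scale), -max_level, max_level)
    return q * scale

rng = np.random.default_rng(0)
w = rng.normal(scale=0.02, size=(1024, 1024))   # stand-in for one big layer

w_small = uniform_quantize(magnitude_prune(w))
rel_err = np.linalg.norm(w - w_small) / np.linalg.norm(w)
print(f"relative weight error after prune + quantize: {rel_err:.3f}")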

@inproceedings{TrainBigICML20,
 abstract = {
Since hardware resources are limited, the objective of training deep learning models is typically to maximize accuracy subject to the time and memory constraints of training and inference. We study the impact of model size in this setting, focusing on Transformer models for NLP tasks that are limited by compute: self-supervised pretraining and high-resource machine translation. We first show that even though smaller Transformer models execute faster per iteration, wider and deeper models converge in significantly fewer steps. Moreover, this acceleration in convergence typically outpaces the additional computational overhead of using larger models. Therefore, the most compute-efficient training strategy is to counterintuitively train extremely large models but stop after a small number of iterations.

This leads to an apparent trade-off between the training efficiency of large Transformer models and the inference efficiency of small Transformer models. However, we show that large models are more robust to compression techniques such as quantization and pruning than small models. Consequently, one can get the best of both worlds: heavily compressed, large models achieve higher accuracy than lightly compressed, small models.
},
 author = {Zhuohan Li and Eric Wallace and Sheng Shen and Kevin Lin and Kurt Keutzer and Dan Klein and Joseph E. Gonzalez},
 bdsk-url-1 = {https://proceedings.icml.cc/static/paper%5C%5Ffiles/icml/2020/6626-Paper.pdf},
 booktitle = {Proceedings of the International Conference on Machine Learning (ICML)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev, selected},
 month = {7},
 series = {ICML'20},
 title = {Train Big, Then Compress: Rethinking Model Size for Efficient Training and Inference of Transformers},
 url = {https://proceedings.icml.cc/static/paper\%5Ffiles/icml/2020/6626-Paper.pdf},
 year = {2020}
}

Alvin Wan, Xiaoliang Dai, Peizhao Zhang, Zijian He, Yuandong Tian, Saining Xie, Bichen Wu, Matthew Yu, Tao Xu, Kan Chen, Peter Vajda, and Joseph E. Gonzalez. "FBNetV2: Differentiable Neural Architecture Search for Spatial and Channel Dimensions." Proceedings of the Conference on Computer Vision and Pattern Recognition (CVPR), 2020.

Differentiable Neural Architecture Search (DNAS) has demonstrated great success in designing state-of-the-art, efficient neural networks. However, DARTS-based DNAS's search space is small when compared to other search methods', since all candidate network layers must be explicitly instantiated in memory. To address this bottleneck, we propose a memory and computationally efficient DNAS variant: DMaskingNAS. This algorithm expands the search space by up to $10^{14}\times$ over conventional DNAS, supporting searches over spatial and channel dimensions that are otherwise prohibitively expensive: input resolution and number of filters. We propose a masking mechanism for feature map reuse, so that memory and computational costs stay nearly constant as the search space expands. Furthermore, we employ effective shape propagation to maximize per-FLOP or per-parameter accuracy. The searched FBNetV2s yield state-of-the-art performance when compared with all previous architectures. With up to 421x less search cost, DMaskingNAS finds models with 0.9\% higher accuracy, 15\% fewer FLOPs than MobileNetV3-Small; and with similar accuracy but 20\% fewer FLOPs than Efficient-B0. Furthermore, our FBNetV2 outperforms MobileNetV3 by 2.6\% in accuracy, with equivalent model size. FBNetV2 models are open-sourced at https://github.com/facebookresearch/mobile-vision.
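
The central masking idea can be sketched in a few lines: candidate channel counts are represented as binary masks, a Gumbel-softmax over architecture logits weights them, and the resulting effective mask is applied to a feature map that is computed only once. This is an illustrative rendering, not the DMaskingNAS implementation; the shapes, logits, and temperature are made up.

import numpy as np

rng = np.random.default_rng(0)
C = 8                                     # maximum number of channels searched over
x = rng.normal(size=(C, 16, 16))          # one feature map, computed once and reused

# Candidate widths 1..C, each represented by a binary channel mask.
masks = np.stack([np.concatenate([np.ones(k), np.zeros(C - k)]) for k in range(1, C + 1)])

def gumbel_softmax(logits, tau=1.0, rng=rng):
    g = -np.log(-np.log(rng.uniform(size=logits.shape)))   # Gumbel(0, 1) noise
    z = (logits + g) / tau
    e = np.exp(z - z.max())
    return e / e.sum()

logits = np.zeros(C)                      # architecture parameters (learned in DNAS)
weights = gumbel_softmax(logits)

# Effective mask = convex combination of the candidate masks; applying it to the
# shared feature map approximates sampling a channel count without recomputing x.
effective_mask = weights @ masks          # shape (C,)
masked_x = x * effective_mask[:, None, None]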

@inproceedings{FBNetV2,
 abstract = {Differentiable Neural Architecture Search (DNAS) has demonstrated great success in designing state-of-the-art, efficient neural networks. However, DARTS-based DNAS's search space is small when compared to other search methods', since all candidate network layers must be explicitly instantiated in memory. To address this bottleneck, we propose a memory and computationally efficient DNAS variant: DMaskingNAS. This algorithm expands the search space by up to \$10^{14}\times\$ over conventional DNAS, supporting searches over spatial and channel dimensions that are otherwise prohibitively expensive: input resolution and number of filters. We propose a masking mechanism for feature map reuse, so that memory and computational costs stay nearly constant as the search space expands. Furthermore, we employ effective shape propagation to maximize per-FLOP or per-parameter accuracy. The searched FBNetV2s yield state-of-the-art performance when compared with all previous architectures. With up to 421x less search cost, DMaskingNAS finds models with 0.9\% higher accuracy, 15\% fewer FLOPs than MobileNetV3-Small; and with similar accuracy but 20\% fewer FLOPs than Efficient-B0. Furthermore, our FBNetV2 outperforms MobileNetV3 by 2.6\% in accuracy, with equivalent model size. FBNetV2 models are open-sourced at https://github.com/facebookresearch/mobile-vision.},
 author = {Alvin Wan and Xiaoliang Dai and Peizhao Zhang and Zijian He and Yuandong Tian and Saining Xie and Bichen Wu and Matthew Yu and Tao Xu and Kan Chen and Peter Vajda and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/2004.05565},
 booktitle = {Proceedings of the Conference on Computer Vision and Pattern Recognition (CVPR)},
 code = {https://github.com/facebookresearch/mobile-vision},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev, selected},
 month = {6},
 title = { {FBNetV2}: Differentiable Neural Architecture Search for Spatial and Channel Dimensions},
 url = {https://arxiv.org/abs/2004.05565},
 year = {2020}
}

Rolando Garcia, Eric Liu, Vikram Sreekanti, Bobby Yan, Anusha Dandamudi, Joseph E. Gonzalez, Joseph M. Hellerstein, and Koushik Sen. "Hindsight Logging for Model Training." Proc. VLDB Endow., 2020.

In modern Machine Learning, model training is an iterative, experimental process that can consume enormous computation resources and developer time. To aid in that process, experienced model developers log and visualize program variables during training runs. Exhaustive logging of all variables is infeasible, so developers are left to choose between slowing down training via extensive conservative logging, or letting training run fast via minimalist optimistic logging that may omit key information. As a compromise, optimistic logging can be accompanied by program checkpoints; this allows developers to add log statements post-hoc, and "replay" desired log statements from checkpoint---a process we refer to as hindsight logging. Unfortunately, hindsight logging raises tricky problems in data management and software engineering. Done poorly, hindsight logging can waste resources and generate technical debt embodied in multiple variants of training code. In this paper, we present methodologies for efficient and effective logging practices for model training, with a focus on techniques for hindsight logging. Our goal is for experienced model developers to learn and adopt these practices. To make this easier, we provide an open-source suite of tools for Fast Low-Overhead Recovery (flor) that embodies our design across three tasks: (i) efficient background logging in Python, (ii) adaptive periodic checkpointing, and (iii) an instrumentation library that codifies hindsight logging for efficient and automatic record-replay of model-training. Model developers can use each flor tool separately as they see fit, or they can use flor in hands-free mode, entrusting it to instrument their code end-to-end for efficient record-replay. Our solutions leverage techniques from physiological transaction logs and recovery in database systems. Evaluations on modern ML benchmarks demonstrate that flor can produce fast checkpointing with small user-specifiable overheads (e.g. 7\%), and still provide hindsight log replay times orders of magnitude faster than restarting training from scratch.
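
flor's instrumentation is considerably more sophisticated (background logging, adaptive checkpointing, automatic record-replay); the toy sketch below only shows the underlying record-replay idea: checkpoint model and RNG state periodically during the fast run, then replay a chosen span from the nearest checkpoint with a log statement added after the fact. All function names and the toy "training" are invented.

import random

CHECKPOINT_EVERY = 5
checkpoints = {}                     # epoch -> (weights, RNG state)

def train_epoch(weights):
    # Stand-in for one epoch of training: nudge the weights with seeded noise.
    return [w + random.gauss(0, 0.01) for w in weights]

def fast_run(num_epochs=20):
    # Optimistic run: minimal logging, but periodic checkpoints of model + RNG state.
    random.seed(0)
    weights = [0.0] * 4
    for epoch in range(num_epochs):
        if epoch % CHECKPOINT_EVERY == 0:
            checkpoints[epoch] = (list(weights), random.getstate())
        weights = train_epoch(weights)
    return weights

def replay_with_logging(target_epoch):
    # Hindsight logging: resume from the nearest checkpoint at or before the target
    # epoch and re-execute with a log statement that was added after the fact.
    start = max(e for e in checkpoints if e <= target_epoch)
    weights, rng_state = checkpoints[start]
    weights = list(weights)
    random.setstate(rng_state)       # restoring RNG state makes the replay bit-identical
    for epoch in range(start, target_epoch + 1):
        weights = train_epoch(weights)
        print("epoch", epoch, "weights", [round(w, 4) for w in weights])  # post-hoc log
    return weights

fast_run()
replay_with_logging(12)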

@article{rol2020hindsight,
 abstract = {In modern Machine Learning, model training is an iterative, experimental process that can consume enormous computation resources and developer time. To aid in that process, experienced model developers log and visualize program variables during training runs. Exhaustive logging of all variables is infeasible, so developers are left to choose between slowing down training via extensive conservative logging, or letting training run fast via minimalist optimistic logging that may omit key information. As a compromise, optimistic logging can be accompanied by program checkpoints; this allows developers to add log statements post-hoc, and "replay" desired log statements from checkpoint---a process we refer to as hindsight logging. Unfortunately, hindsight logging raises tricky problems in data management and software engineering. Done poorly, hindsight logging can waste resources and generate technical debt embodied in multiple variants of training code. In this paper, we present methodologies for efficient and effective logging practices for model training, with a focus on techniques for hindsight logging. Our goal is for experienced model developers to learn and adopt these practices. To make this easier, we provide an open-source suite of tools for Fast Low-Overhead Recovery (flor) that embodies our design across three tasks: (i) efficient background logging in Python, (ii) adaptive periodic checkpointing, and (iii) an instrumentation library that codifies hindsight logging for efficient and automatic record-replay of model-training. Model developers can use each flor tool separately as they see fit, or they can use flor in hands-free mode, entrusting it to instrument their code end-to-end for efficient record-replay. Our solutions leverage techniques from physiological transaction logs and recovery in database systems. Evaluations on modern ML benchmarks demonstrate that flor can produce fast checkpointing with small user-specifiable overheads (e.g. 7\%), and still provide hindsight log replay times orders of magnitude faster than restarting training from scratch.},
 author = {Rolando Garcia and Eric Liu and Vikram Sreekanti and Bobby Yan and Anusha Dandamudi and Joseph E. Gonzalez and Joseph M. Hellerstein and Koushik Sen},
 bdsk-url-1 = {https://doi.org/10.14778/3436905.3436925},
 doi = {10.14778/3436905.3436925},
 issn = {2150-8097},
 issue_date = {December 2020},
 journal = {Proc. VLDB Endow.},
 keywords = {peerrev},
 month = {12},
 number = {4},
 numpages = {12},
 pages = {682--693},
 publisher = {VLDB Endowment},
 title = {Hindsight Logging for Model Training},
 url = {https://doi.org/10.14778/3436905.3436925},
 volume = {14},
 year = {2020}
}

Daniel Crankshaw, Gur-Eyal Sela, Corey Zumar, Xiangxi Mo, Joseph E. Gonzalez, Ion Stoica, and Alexey Tumanov. "InferLine: ML Inference Pipeline Composition Framework." Proceedings of the ACM Symposium on Cloud Computing, 2020.

The dominant cost in production machine learning workloads is not training individual models but serving predictions from increasingly complex prediction pipelines spanning multiple models, machine learning frameworks, and parallel hardware accelerators. Due to the complex interaction between model configurations and parallel hardware, prediction pipelines are challenging to provision and costly to execute when serving interactive latency-sensitive applications. This challenge is exacerbated by the unpredictable dynamics of bursty workloads. In this paper we introduce InferLine, a system which efficiently provisions and executes ML inference pipelines subject to end-to-end latency constraints by proactively optimizing and reactively controlling per-model configuration in a fine-grained fashion. Unpredictable changes in the serving workload are dynamically and cost-optimally accommodated with minimal service level degradation. InferLine introduces (1) automated model profiling and pipeline lineage extraction, (2) a fine-grain, cost-minimizing pipeline configuration planner, and (3) a fine-grain reactive controller. InferLine is able to configure and deploy prediction pipelines across a wide range of workload patterns and latency goals. It outperforms coarse-grained configuration alternatives by up to 7.6x in cost while achieving up to 32x lower SLO miss rate on real workloads and generalizes across state-of-the-art model serving frameworks.

@inproceedings{InferlineSOCC20,
 abstract = {
The dominant cost in production machine learning workloads is not training individual models but serving predictions from increasingly complex prediction pipelines spanning multiple models, machine learning frameworks, and parallel hardware accelerators. Due to the complex interaction between model configurations and parallel hardware, prediction pipelines are challenging to provision and costly to execute when serving interactive latency-sensitive applications. This challenge is exacerbated by the unpredictable dynamics of bursty workloads.

In this paper we introduce InferLine, a system which efficiently provisions and executes ML inference pipelines subject to end-to-end latency constraints by proactively optimizing and reactively controlling per-model configuration in a fine-grained fashion. Unpredictable changes in the serving workload are dynamically and cost-optimally accommodated with minimal service level degradation. InferLine introduces (1) automated model profiling and pipeline lineage extraction, (2) a fine-grain, cost-minimizing pipeline configuration planner, and (3) a fine-grain reactive controller. InferLine is able to configure and deploy prediction pipelines across a wide range of workload patterns and latency goals. It outperforms coarse-grained configuration alternatives by up 7.6x in cost while achieving up to 32x lower SLO miss rate on real workloads and generalizes across state-of-the-art model serving frameworks.
},
 author = {Daniel Crankshaw and Gur{-}Eyal Sela and Corey Zumar and Xiangxi Mo and Joseph E. Gonzalez and Ion Stoica and Alexey Tumanov},
 bdsk-url-1 = {http://arxiv.org/abs/1812.01776},
 booktitle = {Proceedings of the ACM Symposium on Cloud Computing},
 keywords = {peerrev},
 month = {11},
 publisher = {Association for Computing Machinery},
 series = { {SoCC} '20},
 title = { {InferLine}: {ML} Inference Pipeline Composition Framework},
 url = {http://arxiv.org/abs/1812.01776},
 year = {2020}
}

Ashwin Balakrishna, Brijen Thananjeyan, Jonathan Lee, Felix Li, Arsh Zahed, Joseph E. Gonzalez, and Ken Goldberg. "On-Policy Robot Imitation Learning from a Converging Supervisor." Proceedings of the Conference on Robot Learning, 2020.

Existing on-policy imitation learning algorithms, such as DAgger, assume access to a fixed supervisor. However, there are many settings where the supervisor may evolve during policy learning, such as a human performing a novel task or an improving algorithmic controller. We formalize imitation learning from a "converging supervisor" and provide sublinear static and dynamic regret guarantees against the best policy in hindsight with labels from the converged supervisor, even when labels during learning are only from intermediate supervisors. We then show that this framework is closely connected to a class of reinforcement learning (RL) algorithms known as dual policy iteration (DPI), which alternate between training a reactive learner with imitation learning and a model-based supervisor with data from the learner. Experiments suggest that when this framework is applied with the state-of-the-art deep model-based RL algorithm PETS as an improving supervisor, it outperforms deep RL baselines on continuous control tasks and provides up to an 80-fold speedup in policy evaluation.

@inproceedings{Balakrishna20,
 abstract = {Existing on-policy imitation learning algorithms, such as DAgger, assume access to a fixed supervisor. However, there are many settings where the supervisor may evolve during policy learning, such as a human performing a novel task or an improving algorithmic controller. We formalize imitation learning from a ``converging supervisor'' and provide sublinear static and dynamic regret guarantees against the best policy in hindsight with labels from the converged supervisor, even when labels during learning are only from intermediate supervisors. We then show that this framework is closely connected to a class of reinforcement learning (RL) algorithms known as dual policy iteration (DPI), which alternate between training a reactive learner with imitation learning and a model-based supervisor with data from the learner. Experiments suggest that when this framework is applied with the state-of-the-art deep model-based RL algorithm PETS as an improving supervisor, it outperforms deep RL baselines on continuous control tasks and provides up to an 80-fold speedup in policy evaluation.},
 author = {Ashwin Balakrishna and Brijen Thananjeyan and Jonathan Lee and Felix Li and Arsh Zahed and Joseph E. Gonzalez and Ken Goldberg},
 bdsk-url-1 = {http://proceedings.mlr.press/v100/balakrishna20a.html},
 booktitle = {Proceedings of the Conference on Robot Learning},
 date-modified = {2020-08-02 11:27:35 -0700},
 editor = {Kaelbling, Leslie Pack and Kragic, Danica and Sugiura, Komei},
 keywords = {peerrev},
 month = {10},
 pages = {24--41},
 pdf = {http://proceedings.mlr.press/v100/balakrishna20a/balakrishna20a.pdf},
 publisher = {PMLR},
 series = {Proceedings of Machine Learning Research},
 title = {On-Policy Robot Imitation Learning from a Converging Supervisor},
 url = {http://proceedings.mlr.press/v100/balakrishna20a.html},
 volume = {100},
 year = {2020}
}

Vikram Sreekanti, Chenggang Wu, Saurav Chhatrapati, Joseph E. Gonzalez, Joseph M. Hellerstein, and Jose M. Faleiro. "A Fault-Tolerance Shim for Serverless Computing." Proceedings of the Fifteenth European Conference on Computer Systems (EuroSys), 2020.

Serverless computing has grown in popularity in recent years, with an increasing number of applications being built on Functions-as-a-Service (FaaS) platforms. By default, FaaS platforms support retry-based fault tolerance, but this is insufficient for programs that modify shared state, as they can unwittingly persist partial sets of updates in case of failures. To address this challenge, we would like atomic visibility of the updates made by a FaaS application. In this paper, we present aft, an atomic fault tolerance shim for serverless applications. aft interposes between a commodity FaaS platform and storage engine and ensures atomic visibility of updates by enforcing the read atomic isolation guarantee. aft supports new protocols to guarantee read atomic isolation in the serverless setting. We demonstrate that aft introduces minimal overhead relative to existing storage engines and scales smoothly to thousands of requests per second, while preventing a significant number of consistency anomalies.

@inproceedings{AftEuroSys20,
 abstract = {
Serverless computing has grown in popularity in recent years, with an increasing number of applications being built on Functions-as-a-Service (FaaS) platforms. By default, FaaS platforms support retry-based fault tolerance, but this is insufficient for programs that modify shared state, as they can unwittingly persist partial sets of updates in case of failures. To address this challenge, we would like atomic visibility of the updates made by a FaaS application.

In this paper, we present aft, an atomic fault tolerance shim for serverless applications. aft interposes between a commodity FaaS platform and storage engine and ensures atomic visibility of updates by enforcing the read atomic isolation guarantee. aft supports new protocols to guarantee read atomic isolation in the serverless setting. We demonstrate that aft introduces minimal overhead relative to existing storage engines and scales smoothly to thousands of requests per second, while preventing a significant number of consistency anomalies.
},
 address = {New York, NY, USA},
 articleno = {15},
 author = {Vikram Sreekanti and Chenggang Wu and Saurav Chhatrapati and Joseph E. Gonzalez and Joseph M. Hellerstein and Jose M. Faleiro},
 bdsk-url-1 = {https://doi.org/10.1145/3342195.3387535},
 booktitle = {Proceedings of the Fifteenth European Conference on Computer Systems (EuroSys)},
 code = {https://github.com/vsreekanti/aft},
 date-modified = {2020-08-02 11:27:35 -0700},
 isbn = {9781450368827},
 keywords = {peerrev},
 location = {Heraklion, Greece},
 numpages = {15},
 publisher = {Association for Computing Machinery},
 series = {EuroSys '20},
 title = {A Fault-Tolerance Shim for Serverless Computing},
 url = {https://doi.org/10.1145/3342195.3387535},
 year = {2020}
}

Jianfei Chen, Yu Gai, Zhewei Yao, Michael W. Mahoney, and Joseph E. Gonzalez. "A Statistical Framework for Low-bitwidth Training of Deep Neural Networks." Advances in Neural Information Processing Systems, 2020.

Fully quantized training (FQT), which uses low-bitwidth hardware by quantizing the activations, weights, and gradients of a neural network model, is a promising approach to accelerate the training of deep neural networks. One major challenge with FQT is the lack of theoretical understanding, in particular of how gradient quantization impacts convergence properties. In this paper, we address this problem by presenting a statistical framework for analyzing FQT algorithms. We view the quantized gradient of FQT as a stochastic estimator of its full precision counterpart, a procedure known as quantization-aware training (QAT). We show that the FQT gradient is an unbiased estimator of the QAT gradient, and we discuss the impact of gradient quantization on its variance. Inspired by these theoretical results, we develop two novel gradient quantizers, and we show that these have smaller variance than the existing per-tensor quantizer. For training ResNet-50 on ImageNet, our 5-bit block Householder quantizer achieves only 0.5\% validation accuracy loss relative to QAT, comparable to the existing INT8 baseline.
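
To make the estimator view concrete, the sketch below implements a generic per-tensor stochastic-rounding quantizer and checks empirically that it is unbiased, i.e. that averaging many independent quantizations recovers the input. It is an illustration, not the paper's block Householder quantizer; the bit-width and tensor are arbitrary.

import numpy as np

def stochastic_quantize(x, bits=5, rng=None):
    # Unbiased per-tensor quantizer: map to an integer grid, then round up or down
    # with probability proportional to the fractional part.
    if rng is None:
        rng = np.random.default_rng(0)
    levels = 2 ** bits - 1
    lo, hi = x.min(), x.max()
    scale = (hi - lo) / levels
    t = (x - lo) / scale                       # position on the integer grid
    floor = np.floor(t)
    q = floor + (rng.uniform(size=x.shape) < (t - floor))
    return q * scale + lo

rng = np.random.default_rng(1)
g = rng.normal(size=1000)                      # stand-in for a gradient tensor

# Average many independent quantizations: the mean converges to the input
# (unbiasedness), even though each individual draw is low precision.
avg = np.mean([stochastic_quantize(g, rng=np.random.default_rng(s)) for s in range(2000)], axis=0)
print(np.max(np.abs(avg - g)))                 # small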

@inproceedings{ChenGYMG20,
 abstract = {Fully quantized training (FQT), which uses low-bitwidth hardware by quantizing the activations, weights, and gradients of a neural network model, is a promising approach to accelerate the training of deep neural networks. One major challenge with FQT is the lack of theoretical understanding, in particular of how gradient quantization impacts convergence properties. In this paper, we address this problem by presenting a statistical framework for analyzing FQT algorithms. We view the quantized gradient of FQT as a stochastic estimator of its full precision counterpart, a procedure known as quantization-aware training (QAT). We show that the FQT gradient is an unbiased estimator of the QAT gradient, and we discuss the impact of gradient quantization on its variance. Inspired by these theoretical results, we develop two novel gradient quantizers, and we show that these have smaller variance than the existing per-tensor quantizer. For training ResNet-50 on ImageNet, our 5-bit block Householder quantizer achieves only 0.5\% validation accuracy loss relative to QAT, comparable to the existing INT8 baseline.},
 author = {Jianfei Chen and Yu Gai and Zhewei Yao and Michael W. Mahoney and Joseph E. Gonzalez},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2020/file/099fe6b0b444c23836c4a5d07346082b-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
 keywords = {peerrev, selected},
 pages = {883--894},
 publisher = {Curran Associates, Inc.},
 title = {A Statistical Framework for Low-bitwidth Training of Deep Neural Networks},
 url = {https://proceedings.neurips.cc/paper/2020/file/099fe6b0b444c23836c4a5d07346082b-Paper.pdf},
 volume = {33},
 year = {2020}
}

Brijen Thananjeyan, Ashwin Balakrishna, Ugo Rosolia, Joseph E. Gonzalez, Aaron D. Ames, and Ken Goldberg. "ABC-LMPC: Safe Sample-Based Learning MPC for Stochastic Nonlinear Dynamical Systems with Adjustable Boundary Conditions." Proceedings of the Int. Workshop on the Algorithmic Foundations of Robotics (WAFR), 2020.

Sample-based learning model predictive control (LMPC) strategies have recently attracted attention due to their desirable theoretical properties and their good empirical performance on robotic tasks. However, prior analysis of LMPC controllers for stochastic systems has mainly focused on linear systems in the iterative learning control setting. We present a novel LMPC algorithm, Adjustable Boundary Condition LMPC (ABC-LMPC), which enables rapid adaptation to novel start and goal configurations and theoretically show that the resulting controller guarantees iterative improvement in expectation for stochastic nonlinear systems. We present results with a practical instantiation of this algorithm and experimentally demonstrate that the resulting controller adapts to a variety of initial and terminal conditions on 3 stochastic continuous control tasks.

@inproceedings{ABCLMPCWAFR20,
 abstract = {Sample-based learning model predictive control (LMPC) strategies have recently attracted attention due to their desirable theoretical properties and their good empirical performance on robotic tasks. However, prior analysis of LMPC controllers for stochastic systems has mainly focused on linear systems in the iterative learning control setting. We present a novel LMPC algorithm, Adjustable Boundary Condition LMPC (ABC-LMPC), which enables rapid adaptation to novel start and goal configurations and theoretically show that the resulting controller guarantees iterative improvement in expectation for stochastic nonlinear systems. We present results with a practical instantiation of this algorithm and experimentally demonstrate that the resulting controller adapts to a variety of initial and terminal conditions on 3 stochastic continuous control tasks.},
 author = {Brijen Thananjeyan and Ashwin Balakrishna and Ugo Rosolia and Joseph E. Gonzalez and Aaron D. Ames and Ken Goldberg},
 bdsk-url-1 = {https://arxiv.org/abs/2003.01410},
 booktitle = {Proceedings of the Int. Workshop on the Algorithmic Foundations of Robotics (WAFR)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = { {ABC-LMPC:} Safe Sample-Based Learning {MPC} for Stochastic Nonlinear Dynamical Systems with Adjustable Boundary Conditions},
 url = {https://arxiv.org/abs/2003.01410},
 year = {2020}
}

Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. "Ansor: Generating High-Performance Tensor Programs for Deep Learning." 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4-6, 2020, 2020.

High-performance tensor programs are crucial to guarantee efficient execution of deep neural networks. However, obtaining performant tensor programs for different operators on various hardware platforms is notoriously challenging. Currently, deep learning systems rely on vendor-provided kernel libraries or various search strategies to get performant tensor programs. These approaches either require significant engineering effort to develop platform-specific optimization code or fall short of finding high-performance programs due to restricted search space and ineffective exploration strategy. We present Ansor, a tensor program generation framework for deep learning applications. Compared with existing search strategies, Ansor explores many more optimization combinations by sampling programs from a hierarchical representation of the search space. Ansor then fine-tunes the sampled programs with evolutionary search and a learned cost model to identify the best programs. Ansor can find high-performance programs that are outside the search space of existing state-of-the-art approaches. In addition, Ansor utilizes a task scheduler to simultaneously optimize multiple subgraphs in deep neural networks. We show that Ansor improves the execution performance of deep neural networks relative to the state-of-the-art on the Intel CPU, ARM CPU, and NVIDIA GPU by up to 3.8x, 2.6x, and 1.7x, respectively.
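
A generic sketch of the search loop the abstract describes, with stand-ins for every Ansor-specific component: candidates are plain dictionaries of schedule knobs, the "learned cost model" is a fixed scoring function, and "hardware measurement" is that score plus noise. Only the structure (sample, mutate, rank cheaply, measure the most promising few) is the point.

import random

random.seed(0)
KNOBS = {"tile_x": [1, 2, 4, 8, 16], "tile_y": [1, 2, 4, 8, 16], "unroll": [0, 16, 64]}

def sample_candidate():
    return {k: random.choice(v) for k, v in KNOBS.items()}

def mutate(c):
    c = dict(c)
    k = random.choice(list(KNOBS))
    c[k] = random.choice(KNOBS[k])
    return c

def cost_model(c):
    # Stand-in for the learned cost model: any cheap score that ranks candidates.
    return abs(c["tile_x"] * c["tile_y"] - 32) + (0 if c["unroll"] else 5)

def measure(c):
    # Stand-in for real hardware measurement (the expensive step).
    return cost_model(c) + random.gauss(0, 0.1)

population = [sample_candidate() for _ in range(32)]
for generation in range(10):
    children = [mutate(random.choice(population)) for _ in range(32)]
    # Rank cheaply with the cost model, then measure only the most promising few.
    ranked = sorted(population + children, key=cost_model)
    measured = sorted(ranked[:8], key=measure)
    population = measured + ranked[8:32]

print("best schedule:", population[0])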

@inproceedings{ZhengJSWYHWYZSG20,
 abstract = {
High-performance tensor programs are crucial to guarantee efficient execution of deep neural networks. However, obtaining performant tensor programs for different operators on various hardware platforms is notoriously challenging. Currently, deep learning systems rely on vendor-provided kernel libraries or various search strategies to get performant tensor programs. These approaches either require significant engineering effort to develop platform-specific optimization code or fall short of finding high-performance programs due to restricted search space and ineffective exploration strategy.

We present Ansor, a tensor program generation framework for deep learning applications. Compared with existing search strategies, Ansor explores many more optimization combinations by sampling programs from a hierarchical representation of the search space. Ansor then fine-tunes the sampled programs with evolutionary search and a learned cost model to identify the best programs. Ansor can find high-performance programs that are outside the search space of existing state-of-the-art approaches. In addition, Ansor utilizes a task scheduler to simultaneously optimize multiple subgraphs in deep neural networks. We show that Ansor improves the execution performance of deep neural networks relative to the state-of-the-art on the Intel CPU, ARM CPU, and NVIDIA GPU by up to 3.8x, 2.6x, and 1.7x, respectively.
},
 author = {Lianmin Zheng and Chengfan Jia and Minmin Sun and Zhao Wu and Cody Hao Yu and Ameer Haj{-}Ali and Yida Wang and Jun Yang and Danyang Zhuo and Koushik Sen and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {https://www.usenix.org/conference/osdi20/presentation/zheng},
 booktitle = {14th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI} 2020, Virtual Event, November 4-6, 2020},
 keywords = {peerrev, selected},
 pages = {863--879},
 publisher = { {USENIX} Association},
 title = {Ansor: Generating High-Performance Tensor Programs for Deep Learning},
 url = {https://www.usenix.org/conference/osdi20/presentation/zheng},
 year = {2020}
}

Mong H. Ng, Kaahan Radia, Jianfei Chen, Dequan Wang, Ionel Gog, and Joseph E. Gonzalez. "BEV-Seg: Bird's Eye View Semantic Segmentation Using Geometry and Semantic Point Cloud." Proceedings of the Workshop in Scalability for Autonomous Driving at CVPR'20, 2020.

Bird's-eye-view (BEV) is a powerful and widely adopted representation for road scenes that captures surrounding objects and their spatial locations, along with overall context in the scene. In this work, we focus on bird's eye semantic segmentation, a task that predicts pixel-wise semantic segmentation in BEV from side RGB images. This task is made possible by simulators such as Carla, which allow for cheap data collection, arbitrary camera placements, and supervision in ways otherwise not possible in the real world. There are two main challenges to this task: the view transformation from side view to bird's eye view, as well as transfer learning to unseen domains. Existing work transforms between views through fully connected layers and transfer learns via GANs. This suffers from a lack of depth reasoning and performance degradation across domains. Our novel 2-staged perception pipeline explicitly predicts pixel depths and combines them with pixel semantics in an efficient manner, allowing the model to leverage depth information to infer objects' spatial locations in the BEV. In addition, we enable transfer learning by abstracting high-level geometric features and predicting an intermediate representation that is common across different domains. We publish a new dataset called BEVSEG-Carla and show that our approach improves the state of the art by 24\% mIoU and performs well when transferred to a new domain.
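
A minimal numpy sketch of the second stage described above, under invented intrinsics, resolutions, and grid bounds: each pixel's predicted depth is back-projected through a pinhole camera model and its predicted semantic label is scattered into a bird's-eye-view grid over lateral and forward distance.

import numpy as np

H, W = 60, 80
fx = fy = 50.0
cx, cy = W / 2, H / 2

rng = np.random.default_rng(0)
depth = rng.uniform(2.0, 30.0, size=(H, W))          # predicted per-pixel depth (m)
semantics = rng.integers(0, 5, size=(H, W))          # predicted per-pixel class ids

# Back-project every pixel to camera-frame ground-plane coordinates:
# lateral offset X and forward distance Z.
u, v = np.meshgrid(np.arange(W), np.arange(H))
X = (u - cx) * depth / fx
Z = depth

# Scatter labels into a BEV grid over (X, Z): 0.5 m cells, 40 m range.
bev = np.zeros((80, 80), dtype=np.int64)              # rows: forward, cols: lateral
cols = np.clip(((X + 20.0) / 0.5).astype(int), 0, 79)
rows = np.clip((Z / 0.5).astype(int), 0, 79)
for r, c, s in zip(rows.ravel(), cols.ravel(), semantics.ravel()):
    bev[r, c] = s                                     # last-writer-wins label per cell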

@inproceedings{ng2020bevseg,
 abstract = {Bird's-eye-view (BEV) is a powerful and widely adopted representation for road scenes that captures surrounding objects and their spatial locations, along with overall context in the scene. In this work, we focus on bird's eye semantic segmentation, a task that predicts pixel-wise semantic segmentation in BEV from side RGB images. This task is made possible by simulators such as Carla, which allow for cheap data collection, arbitrary camera placements, and supervision in ways otherwise not possible in the real world. There are two main challenges to this task: the view transformation from side view to bird's eye view, as well as transfer learning to unseen domains. Existing work transforms between views through fully connected layers and transfer learns via GANs. This suffers from a lack of depth reasoning and performance degradation across domains. Our novel 2-staged perception pipeline explicitly predicts pixel depths and combines them with pixel semantics in an efficient manner, allowing the model to leverage depth information to infer objects' spatial locations in the BEV. In addition, we transfer learning by abstracting high-level geometric features and predicting an intermediate representation that is common across different domains. We publish a new dataset called BEVSEG-Carla and show that our approach improves state-of-the-art by 24\% mIoU and performs well when transferred to a new domain.},
 archiveprefix = {arXiv},
 author = {Mong H. Ng and Kaahan Radia and Jianfei Chen and Dequan Wang and Ionel Gog and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/2006.11436},
 booktitle = {Proceedings of the Workshop in Scalability for Autonomous Driving at {CVPR}'20},
 eprint = {2006.11436},
 keywords = {peerrev},
 title = { {BEV-Seg}: Bird's Eye View Semantic Segmentation Using Geometry and Semantic Point Cloud},
 url = {https://arxiv.org/abs/2006.11436},
 year = {2020}
}

Tianjun Zhang, Huazhe Xu, Xiaolong Wang, Yi Wu, Kurt Keutzer, Joseph E. Gonzalez, and Yuandong Tian. "BeBold: Exploration Beyond the Boundary of Explored Regions." arXiv, 2020.

Efficient exploration under sparse rewards remains a key challenge in deep reinforcement learning. To guide exploration, previous work makes extensive use of intrinsic reward (IR). There are many heuristics for IR, including visitation counts, curiosity, and state-difference. In this paper, we analyze the pros and cons of each method and propose the regulated difference of inverse visitation counts as a simple but effective criterion for IR. The criterion helps the agent explore Beyond the Boundary of explored regions and mitigates common issues in count-based methods, such as short-sightedness and detachment. The resulting method, BeBold, solves the 12 most challenging procedurally-generated tasks in MiniGrid with just 120M environment steps, without any curriculum learning. In comparison, the previous SoTA only solves $50\%$ of the tasks. BeBold also achieves SoTA on multiple tasks in NetHack, a popular rogue-like game that contains more challenging procedurally-generated environments.
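
Written out, the criterion in the abstract is the clipped difference of inverse visitation counts between the next state and the current one, gated to first visits within the episode. The sketch below is illustrative only; the chain "environment" and count bookkeeping are invented, and BeBold pairs this bonus with a full deep RL agent.

from collections import defaultdict

visit_counts = defaultdict(int)     # lifelong visitation counts N(s)

def intrinsic_reward(s, s_next, seen_this_episode):
    # Regulated difference of inverse counts, gated to first visits in the episode.
    visit_counts[s_next] += 1
    r = max(1.0 / visit_counts[s_next] - 1.0 / max(visit_counts[s], 1), 0.0)
    first_visit = s_next not in seen_this_episode
    seen_this_episode.add(s_next)
    return r * float(first_visit)

# Toy rollout on a 1-D chain: the agent dithers near the start, then pushes outward.
# The bonus fires when stepping from a well-visited state into a rarely visited one,
# i.e. at the boundary of the explored region.
seen = set()
trajectory = [0, 1, 0, 1, 0, 1, 2, 3, 4]
for s, s_next in zip(trajectory[:-1], trajectory[1:]):
    print(s, "->", s_next, "bonus:", round(intrinsic_reward(s, s_next, seen), 3))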

@misc{Zhang21c,
 abstract = {Efficient exploration under sparse rewards remains a key challenge in deep reinforcement learning. To guide exploration, previous work makes extensive use of intrinsic reward (IR). There are many heuristics for IR, including visitation counts, curiosity, and state-difference. In this paper, we analyze the pros and cons of each method and propose the regulated difference of inverse visitation counts as a simple but effective criterion for IR. The criterion helps the agent explore Beyond the Boundary of explored regions and mitigates common issues in count-based methods, such as short-sightedness and detachment. The resulting method, BeBold, solves the 12 most challenging procedurally-generated tasks in MiniGrid with just 120M environment steps, without any curriculum learning. In comparison, the previous SoTA only solves $50\%$ of the tasks. BeBold also achieves SoTA on multiple tasks in NetHack, a popular rogue-like game that contains more challenging procedurally-generated environments.},
 author = {Zhang, Tianjun and Xu, Huazhe and Wang, Xiaolong and Wu, Yi and Keutzer, Kurt and Gonzalez, Joseph E. and Tian, Yuandong},
 bdsk-url-1 = {https://arxiv.org/abs/2012.08621},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2012.08621},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2012.08621},
 keywords = {arxivpre, Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Machine Learning (stat.ML), FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {BeBold: Exploration Beyond the Boundary of Explored Regions},
 url = {https://arxiv.org/abs/2012.08621},
 year = {2020}
}
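
One way to read the "regulated difference of inverse visitation counts" criterion described above is sketched below. The exact clipping and episodic-novelty handling in BeBold may differ from this reading, so treat it as an assumption-laden illustration rather than the paper's algorithm.

from collections import defaultdict

class InverseCountBonus:
    """Count-based intrinsic reward in the spirit of the abstract (sketch only)."""

    def __init__(self):
        self.counts = defaultdict(int)   # lifelong visitation counts N(s)
        self.episode_visited = set()     # states already rewarded this episode

    def new_episode(self, start_state):
        self.episode_visited = {start_state}
        self.counts[start_state] += 1

    def reward(self, state, next_state):
        self.counts[next_state] += 1
        # Regulated (clipped-at-zero) difference of inverse visitation counts:
        # positive only when moving toward less-visited states, i.e. beyond the
        # boundary of already-explored regions.
        bonus = max(1.0 / self.counts[next_state] - 1.0 / self.counts[state], 0.0)
        # Grant the bonus only on the first visit to next_state within the current
        # episode (an assumption; it curbs back-and-forth reward farming).
        if next_state in self.episode_visited:
            bonus = 0.0
        self.episode_visited.add(next_state)
        return bonus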

Yaoqing Yang, Rajiv Khanna, Yaodong Yu, Amir Gholami, Kurt Keutzer, Joseph E. Gonzalez, Kannan Ramchandran, and Michael W. Mahoney. "Boundary thickness and robustness in learning models." Advances in Neural Information Processing Systems, 2020.

Robustness of machine learning models to various adversarial and non-adversarial corruptions continues to be of interest. In this paper, we introduce the notion of the boundary thickness of a classifier, and we describe its connection with and usefulness for model robustness. Thick decision boundaries lead to improved performance, while thin decision boundaries lead to overfitting (e.g., measured by the robust generalization gap between training and testing) and lower robustness. We show that a thicker boundary helps improve robustness against adversarial examples (e.g., improving the robust test accuracy of adversarial training), as well as so-called out-of-distribution (OOD) transforms, and we show that many commonly-used regularization and data augmentation procedures can increase boundary thickness. On the theoretical side, we establish that maximizing boundary thickness is akin to minimizing the so-called mixup loss. Using these observations, we can show that noise-augmentation on mixup training further increases boundary thickness, thereby combating vulnerability to various forms of adversarial attacks and OOD transforms. We can also show that the performance improvement in several recent lines of work happens in conjunction with a thicker boundary.

@inproceedings{YangKYGKGRM20,
 abstract = {Robustness of machine learning models to various adversarial and non-adversarial corruptions continues to be of interest. In this paper, we introduce the notion of the boundary thickness of a classifier, and we describe its connection with and usefulness for model robustness. Thick decision boundaries lead to improved performance, while thin decision boundaries lead to overfitting (e.g., measured by the robust generalization gap between training and testing) and lower robustness. We show that a thicker boundary helps improve robustness against adversarial examples (e.g., improving the robust test accuracy of adversarial training), as well as so-called out-of-distribution (OOD) transforms, and we show that many commonly-used regularization and data augmentation procedures can increase boundary thickness. On the theoretical side, we establish that maximizing boundary thickness is akin to minimizing the so-called mixup loss. Using these observations, we can show that noise-augmentation on mixup training further increases boundary thickness, thereby combating vulnerability to various forms of adversarial attacks and OOD transforms. We can also show that the performance improvement in several recent lines of work happens in conjunction with a thicker boundary.},
 author = {Yaoqing Yang and Rajiv Khanna and Yaodong Yu and Amir Gholami and Kurt Keutzer and Joseph E. Gonzalez and Kannan Ramchandran and Michael W. Mahoney},
 bdsk-url-1 = {https://proceedings.neurips.cc/paper/2020/file/44e76e99b5e194377e955b13fb12f630-Paper.pdf},
 booktitle = {Advances in Neural Information Processing Systems},
 editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
 keywords = {peerrev, selected},
 pages = {6223--6234},
 publisher = {Curran Associates, Inc.},
 title = {Boundary thickness and robustness in learning models},
 url = {https://proceedings.neurips.cc/paper/2020/file/44e76e99b5e194377e955b13fb12f630-Paper.pdf},
 volume = {33},
 year = {2020}
}
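
The abstract connects boundary thickness to the mixup loss. Standard mixup (convexly combining pairs of examples and their labels) is sketched below for context; this is the generic augmentation, not the paper's measurement or training code, and the noise-augmented variant studied in the paper would add a perturbation step on top.

import torch

def mixup_batch(x, y_onehot, alpha=1.0):
    """Convexly combine a batch with a shuffled copy of itself (standard mixup)."""
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    perm = torch.randperm(x.size(0))
    x_mix = lam * x + (1.0 - lam) * x[perm]
    y_mix = lam * y_onehot + (1.0 - lam) * y_onehot[perm]
    return x_mix, y_mix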

Paras Jain, Ajay Jain, Aniruddha Nrusimha, Amir Gholami, Pieter Abbeel, Kurt Keutzer, Ion Stoica, and Joseph E. Gonzalez. "Breaking the Memory Wall with Optimal Tensor Rematerialization." Proceedings of Machine Learning and Systems, 2020.

Modern neural networks are increasingly bottlenecked by the limited capacity of on-device GPU memory. Prior work explores dropping activations as a strategy to scale to larger neural networks with fixed memory. However, these heuristics assume uniform cost per layer and only consider simple linear chain architectures, limiting their usability. In this paper, we formalize the problem of trading off computation time and memory requirements for DNN training as the tensor rematerialization optimization problem. We develop a new system to optimally solve the problem in reasonable times (under an hour) using off-the-shelf MILP solvers. These schedules subsequently accelerate millions of training iterations. Our optimization pass in TensorFlow 2.0 automatically yields real training speedups of up to 4.8x over prior work, and can enable up to a 5x increase in input size for real-world large networks.

@inproceedings{Checkmate20,
 abstract = {Modern neural networks are increasingly bottlenecked by the limited capacity of on-device GPU memory. Prior work explores dropping activations as a strategy to scale to larger neural networks with fixed memory. However, these heuristics assume uniform cost per layer and only consider simple linear chain architectures, limiting their usability. In this paper, we formalize the problem of trading-off computation time and memory requirements for DNN training as the tensor rematerialization optimization problem. We develop a new system to optimally solve the problem in reasonable times (under an hour) using off-the-shelf MILP solvers. These schedules subsequently accelerate millions of training iterations. Our optimization pass in TensorFlow 2.0 automatically yields real training speedups of up to 4.8x over prior work, and can enable up to 5x increase in input size for real-world large networks.},
 author = {Paras Jain and Ajay Jain and Aniruddha Nrusimha and Amir Gholami and Pieter Abbeel and Kurt Keutzer and Ion Stoica and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/1910.02653},
 booktitle = {Proceedings of Machine Learning and Systems},
 code = {https://github.com/parasj/checkmate},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev, selected},
 pages = {497--511},
 title = {Breaking the Memory Wall with Optimal Tensor Rematerialization},
 url = {https://arxiv.org/abs/1910.02653},
 year = {2020}
}
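
The abstract contrasts optimal rematerialization with heuristic activation dropping. The snippet below shows the common heuristic baseline, PyTorch gradient checkpointing over a linear chain, not the paper's MILP-based schedule: activations inside each checkpointed segment are discarded in the forward pass and recomputed during backpropagation, trading compute for memory.

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

# An 8-block linear chain; only segment-boundary activations are kept live.
model = nn.Sequential(*[nn.Sequential(nn.Linear(1024, 1024), nn.ReLU())
                        for _ in range(8)])
x = torch.randn(32, 1024, requires_grad=True)

out = checkpoint_sequential(model, 4, x)  # 4 segments: drop and recompute inside each
out.sum().backward()                      # recomputation happens during this call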

Vikram Sreekanti, Chenggang Wu, Xiayue Charles Lin, Johann Schleier-Smith, Jose M. Faleiro, Joseph E. Gonzalez, Joseph M. Hellerstein, and Alexey Tumanov. "Cloudburst: Stateful Functions-as-a-Service." Proceedings of Very Large Data Bases (PVLDB), 2020.

Function-as-a-Service (FaaS) platforms and ``serverless'' cloud computing are becoming increasingly popular. Current FaaS offerings are targeted at stateless functions that do minimal I/O and communication. We argue that the benefits of serverless computing can be extended to a broader range of applications and algorithms. We present the design and implementation of Cloudburst, a stateful FaaS platform that provides familiar Python programming with low-latency mutable state and communication, while maintaining the autoscaling benefits of serverless computing. Cloudburst accomplishes this by leveraging Anna, an autoscaling key-value store, for state sharing and overlay routing combined with mutable caches co-located with function executors for data locality. Performant cache consistency emerges as a key challenge in this architecture. To this end, Cloudburst provides a combination of lattice-encapsulated state and new definitions and protocols for distributed session consistency. Empirical results on benchmarks and diverse applications show that Cloudburst makes stateful functions practical, reducing the state-management overheads of current FaaS platforms by orders of magnitude while also improving the state of the art in serverless consistency.

@inproceedings{Cloudburst20,
 abstract = {Function-as-a-Service (FaaS) platforms and ``serverless'' cloud computing are becoming increasingly popular. Current FaaS offerings are targeted at stateless functions that do minimal I/O and communication. We argue that the benefits of serverless computing can be extended to a broader range of applications and algorithms. We present the design and implementation of Cloudburst, a stateful FaaS platform that provides familiar Python programming with low-latency mutable state and communication, while maintaining the autoscaling benefits of serverless computing. Cloudburst accomplishes this by leveraging Anna, an autoscaling key-value store, for state sharing and overlay routing combined with mutable caches co-located with function executors for data locality. Performant cache consistency emerges as a key challenge in this architecture. To this end, Cloudburst provides a combination of lattice-encapsulated state and new definitions and protocols for distributed session consistency. Empirical results on benchmarks and diverse applications show that Cloudburst makes stateful functions practical, reducing the state-management overheads of current FaaS platforms by orders of magnitude while also improving the state of the art in serverless consistency.},
 author = {Vikram Sreekanti and Chenggang Wu and Xiayue Charles Lin and Johann Schleier{-}Smith and Jose M. Faleiro and Joseph E. Gonzalez and Joseph M. Hellerstein and Alexey Tumanov},
 bdsk-url-1 = {http://www.vldb.org/pvldb/vol13/p2438-sreekanti.pdf},
 booktitle = {Proceedings of Very Large Data Bases (PVLDB)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev, selected},
 title = {Cloudburst: Stateful Functions-as-a-Service},
 url = {http://www.vldb.org/pvldb/vol13/p2438-sreekanti.pdf},
 volume = {13},
 year = {2020}
}

David E. Culler, Prabal Dutta, Gabe Fierro, Joseph E. Gonzalez, Nathan Pemberton, Johann Schleier-Smith, Kalyanaraman Shankari, Alvin Wan, and Thomas Zachariah. "CoVista: A Unified View on Privacy Sensitive Mobile Contact Tracing." IEEE Data Eng. Bull., 2020.

Governments around the world have become increasingly frustrated with tech giants dictating public health policy. The software created by Apple and Google enables individuals to track their own potential exposure through collated exposure notifications. However, the same software prohibits location tracking, denying key information needed by public health officials for robust contact tracing. This information is needed to treat and isolate COVID-19 positive people, identify transmission hotspots, and protect against continued spread of infection. In this article, we present two simple ideas: the lighthouse and the covid-commons, which address the needs of public health authorities while preserving the privacy-sensitive goals of the Apple and Google exposure notification protocols.

@article{Covid,
 abstract = {Governments around the world have become increasingly frustrated with tech giants dictating public health policy. The software created by Apple and Google enables individuals to track their own potential exposure through collated exposure notifications. However, the same software prohibits location tracking, denying key information needed by public health officials for robust contract tracing. This information is needed to treat and isolate COVID-19 positive people, identify transmission hotspots, and protect against continued spread of infection. In this article, we present two simple ideas: the lighthouse and the covid-commons that address the needs of public health authorities while preserving the privacy-sensitive goals of the Apple and google exposure notification protocols.},
 author = {David E. Culler and Prabal Dutta and Gabe Fierro and Joseph E. Gonzalez and Nathan Pemberton and Johann Schleier{-}Smith and Kalyanaraman Shankari and Alvin Wan and Thomas Zachariah},
 bdsk-url-1 = {http://sites.computer.org/debull/A20june/p83.pdf},
 date-modified = {2020-08-02 11:27:35 -0700},
 journal = { {IEEE} Data Eng. Bull.},
 keywords = {techreport},
 number = {2},
 pages = {83--94},
 title = {CoVista: {A} Unified View on Privacy Sensitive Mobile Contact Tracing},
 url = {http://sites.computer.org/debull/A20june/p83.pdf},
 volume = {43},
 year = {2020}
}

Paras Jain, Ajay Jain, Tianjun Zhang, Pieter Abbeel, Joseph E. Gonzalez, and Ion Stoica. "Contrastive Code Representation Learning." CoRR (arXiv), 2020.

Machine-aided programming tools such as type predictors and code summarizers are increasingly learning-based. However, most code representation learning approaches rely on supervised learning with task-specific annotated datasets. We propose Contrastive Code Representation Learning (ContraCode), a self-supervised algorithm for learning task-agnostic semantic representations of programs via contrastive learning. Our approach uses no human-provided labels, relying only on the raw text of programs. In particular, we design an unsupervised pretext task by generating textually divergent copies of source functions via automated source-to-source compiler transforms that preserve semantics. We train a neural model to identify variants of an anchor program within a large batch of negatives. To solve this task, the network must extract program features representing the functionality, not form, of the program. This is the first application of instance discrimination to code representation learning to our knowledge. We pre-train models over 1.8m unannotated JavaScript methods mined from GitHub. ContraCode pre-training improves code summarization accuracy by 7.9\% over supervised approaches and 4.8\% over RoBERTa pre-training. Moreover, our approach is agnostic to model architecture; for a type inference task, contrastive pre-training consistently improves the accuracy of existing baselines.

@article{jain2020contrastive,
 abstract = {Machine-aided programming tools such as type predictors and code summarizers are increasingly learning-based. However, most code representation learning approaches rely on supervised learning with task-specific annotated datasets. We propose Contrastive Code Representation Learning (ContraCode), a self-supervised algorithm for learning task-agnostic semantic representations of programs via contrastive learning. Our approach uses no human-provided labels, relying only on the raw text of programs. In particular, we design an unsupervised pretext task by generating textually divergent copies of source functions via automated source-to-source compiler transforms that preserve semantics. We train a neural model to identify variants of an anchor program within a large batch of negatives. To solve this task, the network must extract program features representing the functionality, not form, of the program. This is the first application of instance discrimination to code representation learning to our knowledge. We pre-train models over 1.8m unannotated JavaScript methods mined from GitHub. ContraCode pre-training improves code summarization accuracy by 7.9\% over supervised approaches and 4.8\% over RoBERTa pre-training. Moreover, our approach is agnostic to model architecture; for a type inference task, contrastive pre-training consistently improves the accuracy of existing baselines.},
 archiveprefix = {arXiv},
 author = {Paras Jain and Ajay Jain and Tianjun Zhang and Pieter Abbeel and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {https://arxiv.org/abs/2007.04973},
 code = {https://parasj.github.io/contracode/},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {2007.04973},
 journal = {CoRR},
 keywords = {arxivpre},
 primaryclass = {cs.LG},
 title = {Contrastive Code Representation Learning},
 url = {https://arxiv.org/abs/2007.04973},
 year = {2020}
}
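
The instance-discrimination setup described above (identify the transformed variant of an anchor program among in-batch negatives) is commonly trained with an InfoNCE-style objective; a generic sketch is below. It is not ContraCode's exact loss or encoder, and the temperature value is an arbitrary choice.

import torch
import torch.nn.functional as F

def info_nce(z_anchor, z_positive, temperature=0.07):
    """z_anchor, z_positive: (B, D) embeddings of two variants of the same B programs."""
    za = F.normalize(z_anchor, dim=1)
    zp = F.normalize(z_positive, dim=1)
    logits = za @ zp.t() / temperature          # (B, B) cosine similarities
    labels = torch.arange(za.size(0), device=za.device)
    # Row i's positive is column i; every other column serves as a negative.
    return F.cross_entropy(logits, labels)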

Xiaoliang Dai, Alvin Wan, Peizhao Zhang, Bichen Wu, Zijian He, Zhen Wei, Kan Chen, Yuandong Tian, Matthew Yu, Peter Vajda, and Joseph E. Gonzalez. "FBNetV3: Joint Architecture-Recipe Search using Neural Acquisition Function." CoRR (arXiv), 2020.

Neural Architecture Search (NAS) yields state-of-the-art neural networks that outperform their best manually-designed counterparts. However, previous NAS methods search for architectures under one training recipe (i.e., training hyperparameters), ignoring the significance of training recipes and overlooking superior architectures under other training recipes. Thus, they fail to find higher-accuracy architecture-recipe combinations. To address this oversight, we present JointNAS to search both (a) architectures and (b) their corresponding training recipes. To accomplish this, we introduce a neural acquisition function that scores architectures and training recipes jointly. Following pre-training on a proxy dataset, this acquisition function guides both coarse-grained and fine-grained searches to produce FBNetV3. FBNetV3 is a family of state-of-the-art compact ImageNet models, outperforming both automatically and manually-designed architectures. For example, FBNetV3 matches both EfficientNet and ResNeSt accuracy with 1.4x and 5.0x fewer FLOPs, respectively. Furthermore, the JointNAS-searched training recipe yields significant performance gains across different networks and tasks.

@article{FBNetV3,
 abstract = {Neural Architecture Search (NAS) yields state-of-the-art neural networks that outperform their best manually-designed counterparts. However, previous NAS methods search for architectures under one training recipe (i.e., training hyperparameters), ignoring the significance of training recipes and overlooking superior architectures under other training recipes. Thus, they fail to find higher-accuracy architecture-recipe combinations. To address this oversight, we present JointNAS to search both (a) architectures and (b) their corresponding training recipes. To accomplish this, we introduce a neural acquisition function that scores architectures and training recipes jointly. Following pre-training on a proxy dataset, this acquisition function guides both coarse-grained and fine-grained searches to produce FBNetV3. FBNetV3 is a family of state-of-the-art compact ImageNet models, outperforming both automatically and manually-designed architectures. For example, FBNetV3 matches both EfficientNet and ResNeSt accuracy with 1.4x and 5.0x fewer FLOPs, respectively. Furthermore, the JointNAS-searched training recipe yields significant performance gains across different networks and tasks.},
 archiveprefix = {arXiv},
 author = {Xiaoliang Dai and Alvin Wan and Peizhao Zhang and Bichen Wu and Zijian He and Zhen Wei and Kan Chen and Yuandong Tian and Matthew Yu and Peter Vajda and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/2006.02049},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {2006.02049},
 journal = {CoRR},
 keywords = {arxivpre},
 title = { {FBNetV3}: Joint Architecture-Recipe Search using Neural Acquisition Function},
 url = {https://arxiv.org/abs/2006.02049},
 volume = {abs/2006.02049},
 year = {2020}
}

Jeffrey Ichnowski, William Lee, Victor Murta, Samuel Paradis, Ron Alterovitz, Joseph E. Gonzalez, Ion Stoica, and Ken Goldberg. "Fog Robotics Algorithms for Distributed Motion Planning Using Lambda Serverless Computing." 2020 IEEE International Conference on Robotics and Automation, ICRA 2020, Paris, France, May 31 - August 31, 2020, 2020.

For robots using motion planning algorithms such as RRT and RRT*, the computational load can vary by orders of magnitude as the complexity of the local environment changes. To adaptively provide such computation, we propose Fog Robotics algorithms in which cloud-based serverless lambda computing provides parallel computation on demand. To use this parallelism, we propose novel motion planning algorithms that scale effectively with an increasing number of serverless computers. However, given that the allocation of computing is typically bounded by both monetary and time constraints, we show how prior learning can be used to efficiently allocate resources at runtime. We demonstrate the algorithms and application of learned parallel allocation in both simulation and with the Fetch commercial mobile manipulator using Amazon Lambda to complete a sequence of sporadically computationally intensive motion planning tasks.

@inproceedings{IchnowskiLMPAGS20,
 abstract = {For robots using motion planning algorithms such as {RRT} and {RRT\textasteriskcentered}, the computational load can vary by orders of magnitude as the complexity of the local environment changes. To adaptively provide such computation, we propose Fog Robotics algorithms in which cloud-based serverless lambda computing provides parallel computation on demand. To use this parallelism, we propose novel motion planning algorithms that scale effectively with an increasing number of serverless computers. However, given that the allocation of computing is typically bounded by both monetary and time constraints, we show how prior learning can be used to efficiently allocate resources at runtime. We demonstrate the algorithms and application of learned parallel allocation in both simulation and with the Fetch commercial mobile manipulator using Amazon Lambda to complete a sequence of sporadically computationally intensive motion planning tasks.},
 author = {Jeffrey Ichnowski and William Lee and Victor Murta and Samuel Paradis and Ron Alterovitz and Joseph E. Gonzalez and Ion Stoica and Ken Goldberg},
 bdsk-url-1 = {https://doi.org/10.1109/ICRA40945.2020.9196651},
 booktitle = {2020 {IEEE} International Conference on Robotics and Automation, {ICRA} 2020, Paris, France, May 31 - August 31, 2020},
 doi = {10.1109/ICRA40945.2020.9196651},
 keywords = {peerrev, selected},
 pages = {4232--4238},
 publisher = { {IEEE} },
 title = {Fog Robotics Algorithms for Distributed Motion Planning Using Lambda Serverless Computing},
 url = {https://doi.org/10.1109/ICRA40945.2020.9196651},
 year = {2020}
}
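
The idea of bursting computation out to serverless workers on demand can be illustrated with a plain AWS Lambda fan-out. The function name and payload schema below are hypothetical, and this is not the paper's planner or learned resource-allocation logic.

import json
from concurrent.futures import ThreadPoolExecutor

import boto3

lam = boto3.client("lambda")

def plan_segment(seed):
    # "motion-planner" and the payload schema are hypothetical placeholders.
    resp = lam.invoke(
        FunctionName="motion-planner",
        InvocationType="RequestResponse",
        Payload=json.dumps({"seed": seed}),
    )
    return json.loads(resp["Payload"].read())

# Burst out 16 parallel planning attempts when the local problem gets hard.
with ThreadPoolExecutor(max_workers=16) as pool:
    candidate_plans = list(pool.map(plan_segment, range(16)))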

Priya Sundaresan, Jennifer Grannen, Brijen Thananjeyan, Ashwin Balakrishna, Michael Laskey, Kevin Stone, Joseph E. Gonzalez, and Ken Goldberg. "Learning Rope Manipulation Policies Using Dense Object Descriptors Trained on Synthetic Depth Data." 2020 IEEE International Conference on Robotics and Automation, ICRA 2020, Paris, France, May 31 - August 31, 2020, 2020.

Robotic manipulation of deformable 1D objects such as ropes, cables, and hoses is challenging due to the lack of high-fidelity analytic models and large configuration spaces. Furthermore, learning end-to-end manipulation policies directly from images and physical interaction requires significant time on a robot and can fail to generalize across tasks. We address these challenges using interpretable deep visual representations for rope, extending recent work on dense object descriptors for robot manipulation. This facilitates the design of interpretable and transferable geometric policies built on top of the learned representations, decoupling visual reasoning and control. We present an approach that learns point-pair correspondences between initial and goal rope configurations, which implicitly encodes geometric structure, entirely in simulation from synthetic depth images. We demonstrate that the learned representation - dense depth object descriptors (DDODs) - can be used to manipulate a real rope into a variety of different arrangements either by learning from demonstrations or using interpretable geometric policies. In 50 trials of a knot-tying task with the ABB YuMi Robot, the system achieves a 66\% knot-tying success rate from previously unseen configurations.

@inproceedings{SundaresanGTBLS20,
 abstract = {Robotic manipulation of deformable 1D objects such as ropes, cables, and hoses is challenging due to the lack of high-fidelity analytic models and large configuration spaces. Furthermore, learning end-to-end manipulation policies directly from images and physical interaction requires significant time on a robot and can fail to generalize across tasks. We address these challenges using interpretable deep visual representations for rope, extending recent work on dense object descriptors for robot manipulation. This facilitates the design of interpretable and transferable geometric policies built on top of the learned representations, decoupling visual reasoning and control. We present an approach that learns point-pair correspondences between initial and goal rope configurations, which implicitly encodes geometric structure, entirely in simulation from synthetic depth images. We demonstrate that the learned representation - dense depth object descriptors (DDODs) - can be used to manipulate a real rope into a variety of different arrangements either by learning from demonstrations or using interpretable geometric policies. In 50 trials of a knot-tying task with the ABB YuMi Robot, the system achieves a 66\% knot-tying success rate from previously unseen configurations.},
 author = {Priya Sundaresan and Jennifer Grannen and Brijen Thananjeyan and Ashwin Balakrishna and Michael Laskey and Kevin Stone and Joseph E. Gonzalez and Ken Goldberg},
 bdsk-url-1 = {https://doi.org/10.1109/ICRA40945.2020.9197121},
 booktitle = {2020 {IEEE} International Conference on Robotics and Automation, {ICRA} 2020, Paris, France, May 31 - August 31, 2020},
 doi = {10.1109/ICRA40945.2020.9197121},
 keywords = {peerrev},
 pages = {9411--9418},
 publisher = { {IEEE} },
 title = {Learning Rope Manipulation Policies Using Dense Object Descriptors Trained on Synthetic Depth Data},
 url = {https://doi.org/10.1109/ICRA40945.2020.9197121},
 year = {2020}
}

Kirthevasan Kandasamy, Joseph E. Gonzalez, Michael I. Jordan, and Ion Stoica. "Mechanism Design with Bandit Feedback." CoRR (arXiv), 2020.

We study a multi-round welfare-maximising mechanism design problem in instances where agents do not know their values. On each round, a mechanism assigns an allocation to each agent in a set of agents and charges them a price; then the agents provide (stochastic) feedback to the mechanism for the allocation they received. This is motivated by applications in cloud markets and online advertising where an agent may know her value for an allocation only after experiencing it. Therefore, the mechanism needs to explore different allocations for each agent, while simultaneously attempting to find the socially optimal set of allocations. Our focus is on truthful and individually rational mechanisms which imitate the classical VCG mechanism in the long run. To that end, we define three notions of regret for the welfare, the individual utilities of each agent, and that of the mechanism. We show that these three terms are interdependent via an $\Omega(T^{2/3})$ lower bound for the maximum of these three terms after $T$ rounds of allocations, and describe a family of anytime algorithms which achieve this rate. Our framework provides flexibility to control the pricing scheme so as to trade off between the agent and seller regrets, and additionally to control the degree of truthfulness and individual rationality.

@article{Kirthevasan20_mechanism_design,
 abstract = {We study a multi-round welfare-maximising mechanism design problem in instances where agents do not know their values. On each round, a mechanism assigns an allocation each to a set of agents and charges them a price; then the agents provide (stochastic) feedback to the mechanism for the allocation they received. This is motivated by applications in cloud markets and online advertising where an agent may know her value for an allocation only after experiencing it. Therefore, the mechanism needs to explore different allocations for each agent, while simultaneously attempting to find the socially optimal set of allocations. Our focus is on truthful and individually rational mechanisms which imitate the classical VCG mechanism in the long run. To that end, we define three notions of regret for the welfare, the individual utilities of each agent and that of the mechanism. We show that these three terms are interdependent via an \$\Omega(T^{2/3})\$ lower bound for the maximum of these three terms after T rounds of allocations, and describe a family of anytime algorithms which achieve this rate. Our framework provides flexibility to control the pricing scheme so as to trade-off between the agent and seller regrets, and additionally to control the degree of truthfulness and individual rationality.},
 archiveprefix = {arXiv},
 author = {Kirthevasan Kandasamy and Joseph E. Gonzalez and Michael I. Jordan and Ion Stoica},
 bdsk-url-1 = {https://arxiv.org/abs/2004.08924},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {2004.08924},
 journal = {CoRR},
 keywords = {arxivpre},
 title = {Mechanism Design with Bandit Feedback},
 url = {https://arxiv.org/abs/2004.08924},
 volume = {abs/2004.08924},
 year = {2020}
}

Ankur Dave, Chester Leung, Raluca Ada Popa, Joseph E. Gonzalez, and Ion Stoica. "Oblivious Coopetitive Analytics Using Hardware Enclaves." Proceedings of the Fifteenth European Conference on Computer Systems (EuroSys), 2020.

Coopetitive analytics refers to cooperation among competing parties to run queries over their joint data. Regulatory, business, and liability concerns prevent these organizations from sharing their sensitive data in plaintext. We propose Oblivious Coopetitive Queries (OCQ), an efficient, general framework for oblivious coopetitive analytics using hardware enclaves. OCQ builds on Opaque, a Spark-based framework for secure distributed analytics, to execute coopetitive queries using hardware enclaves in a decentralized manner. Its query planner chooses how and where to execute each relational operator to prevent data leakage through side channels such as memory access patterns, network traffic statistics, and cardinality, while minimizing overhead. We implemented OCQ as an extension to Apache Spark SQL. We find that OCQ is up to 9.9x faster than Opaque, a state-of-the-art secure analytics framework which outsources all data and computation to an enclave-enabled cloud; and is up to 219x faster than implementing analytics using AgMPC, a state-of-the-art secure multi-party computation framework.

@inproceedings{OCQ20,
 abstract = {
Coopetitive analytics refers to cooperation among competing parties to run queries over their joint data. Regulatory, business, and liability concerns prevent these organizations from sharing their sensitive data in plaintext.

We propose Oblivious Coopetitive Queries (OCQ), an efficient, general framework for oblivious coopetitive analytics using hardware enclaves. OCQ builds on Opaque, a Spark-based framework for secure distributed analytics, to execute coopetitive queries using hardware enclaves in a decentralized manner. Its query planner chooses how and where to execute each relational operator to prevent data leakage through side channels such as memory access patterns, network traffic statistics, and cardinality, while minimizing overhead.

We implemented OCQ as an extension to Apache Spark SQL. We find that OCQ is up to 9.9x faster than Opaque, a state-of-the-art secure analytics framework which outsources all data and computation to an enclave-enabled cloud; and is up to 219x faster than implementing analytics using AgMPC, a state-of-the-art secure multi-party computation framework.
},
 address = {New York, NY, USA},
 articleno = {39},
 author = {Ankur Dave and Chester Leung and Raluca Ada Popa and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {https://doi.org/10.1145/3342195.3387552},
 booktitle = {Proceedings of the Fifteenth European Conference on Computer Systems (EuroSys)},
 date-modified = {2020-08-02 11:27:35 -0700},
 isbn = {9781450368827},
 keywords = {peerrev},
 location = {Heraklion, Greece},
 numpages = {17},
 publisher = {Association for Computing Machinery},
 series = {EuroSys '20},
 title = {Oblivious Coopetitive Analytics Using Hardware Enclaves},
 url = {https://doi.org/10.1145/3342195.3387552},
 year = {2020}
}

Kirthevasan Kandasamy, Gur-Eyal Sela, Joseph E. Gonzalez, Michael I. Jordan, and Ion Stoica. "Online Learning Demands in Max-min Fairness." arXiv, 2020.

We describe mechanisms for the allocation of a scarce resource among multiple users in a way that is efficient, fair, and strategy-proof, but when users do not know their resource requirements. The mechanism is repeated for multiple rounds and a user's requirements can change on each round. At the end of each round, users provide feedback about the allocation they received, enabling the mechanism to learn user preferences over time. Such situations are common in the shared usage of a compute cluster among many users in an organisation, where all teams may not precisely know the amount of resources needed to execute their jobs. By understating their requirements, users will receive less than they need and consequently not achieve their goals. By overstating them, they may siphon away precious resources that could be useful to others in the organisation. We formalise this task of online learning in fair division via notions of efficiency, fairness, and strategy-proofness applicable to this setting, and study this problem under three types of feedback: when the users' observations are deterministic, when they are stochastic and follow a parametric model, and when they are stochastic and nonparametric. We derive mechanisms inspired by the classical max-min fairness procedure that achieve these requisites, and quantify the extent to which they are achieved via asymptotic rates. We corroborate these insights with an experimental evaluation on synthetic problems and a web-serving task.

@misc{Kandasamy21,
 abstract = {We describe mechanisms for the allocation of a scarce resource among multiple users in a way that is efficient, fair, and strategy-proof, but when users do not know their resource requirements. The mechanism is repeated for multiple rounds and a user's requirements can change on each round. At the end of each round, users provide feedback about the allocation they received, enabling the mechanism to learn user preferences over time. Such situations are common in the shared usage of a compute cluster among many users in an organisation, where all teams may not precisely know the amount of resources needed to execute their jobs. By understating their requirements, users will receive less than they need and consequently not achieve their goals. By overstating them, they may siphon away precious resources that could be useful to others in the organisation. We formalise this task of online learning in fair division via notions of efficiency, fairness, and strategy-proofness applicable to this setting, and study this problem under three types of feedback: when the users' observations are deterministic, when they are stochastic and follow a parametric model, and when they are stochastic and nonparametric. We derive mechanisms inspired by the classical max-min fairness procedure that achieve these requisites, and quantify the extent to which they are achieved via asymptotic rates. We corroborate these insights with an experimental evaluation on synthetic problems and a web-serving task.},
 author = {Kandasamy, Kirthevasan and Sela, Gur-Eyal and Gonzalez, Joseph E and Jordan, Michael I and Stoica, Ion},
 bdsk-url-1 = {https://arxiv.org/abs/2012.08648},
 bdsk-url-2 = {https://doi.org/10.48550/ARXIV.2012.08648},
 copyright = {arXiv.org perpetual, non-exclusive license},
 doi = {10.48550/ARXIV.2012.08648},
 keywords = {Machine Learning (stat.ML), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
 publisher = {arXiv},
 title = {Online Learning Demands in Max-min Fairness},
 url = {https://arxiv.org/abs/2012.08648},
 year = {2020}
}
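
For reference, the classical max-min fairness (progressive filling) procedure that these mechanisms are inspired by is sketched below. It assumes demands are known up front, which is exactly the assumption the paper relaxes through online feedback.

def max_min_fair(demands, capacity):
    """Progressive filling: repeatedly split spare capacity equally among unsatisfied users."""
    alloc = [0.0] * len(demands)
    remaining = capacity
    active = set(range(len(demands)))
    while active and remaining > 1e-12:
        share = remaining / len(active)
        satisfied = set()
        for i in active:
            give = min(share, demands[i] - alloc[i])
            alloc[i] += give
            remaining -= give
            if alloc[i] >= demands[i] - 1e-12:
                satisfied.add(i)
        if not satisfied:   # nobody hit their demand; the equal split used everything
            break
        active -= satisfied
    return alloc

# Example: max_min_fair([2, 2.6, 4, 5], 10) -> approximately [2, 2.6, 2.7, 2.7]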

Vikram Sreekanti, Harikaran Subbaraj, Chenggang Wu, Joseph E. Gonzalez, and Joseph M. Hellerstein. "Optimizing Prediction Serving on Low-Latency Serverless Dataflow." CoRR (arXiv), 2020.

Prediction serving systems are designed to provide large volumes of low-latency inferences from machine learning models. These systems mix data processing and computationally intensive model inference and benefit from multiple heterogeneous processors and distributed computing resources. In this paper, we argue that a familiar dataflow API is well-suited to this latency-sensitive task, and amenable to optimization even with unmodified black-box ML models. We present the design of Cloudflow, a system that provides this API and realizes it on an autoscaling serverless backend. Cloudflow transparently implements performance-critical optimizations including operator fusion and competitive execution. Our evaluation shows that Cloudflow's optimizations yield significant performance improvements on synthetic workloads and that Cloudflow outperforms state-of-the-art prediction serving systems by as much as 2x on real-world prediction pipelines, meeting latency goals of demanding applications like real-time video analysis.

@article{Cloudflow20,
 abstract = {Prediction serving systems are designed to provide large volumes of low-latency inferences machine learning models. These systems mix data processing and computationally intensive model inference and benefit from multiple heterogeneous processors and distributed computing resources. In this paper, we argue that a familiar dataflow API is well-suited to this latency-sensitive task, and amenable to optimization even with unmodified black-box ML models. We present the design of Cloudflow, a system that provides this API and realizes it on an autoscaling serverless backend. Cloudflow transparently implements performance-critical optimizations including operator fusion and competitive execution. Our evaluation shows that Cloudflow's optimizations yield significant performance improvements on synthetic workloads and that Cloudflow outperforms state-of-the-art prediction serving systems by as much as 2x on real-world prediction pipelines, meeting latency goals of demanding applications like real-time video analysis.},
 archiveprefix = {arXiv},
 author = {Vikram Sreekanti and Harikaran Subbaraj and Chenggang Wu and Joseph E. Gonzalez and Joseph M. Hellerstein},
 bdsk-url-1 = {https://arxiv.org/abs/2007.05832},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {2007.05832},
 journal = {CoRR},
 keywords = {arxivpre},
 title = {Optimizing Prediction Serving on Low-Latency Serverless Dataflow},
 url = {https://arxiv.org/abs/2007.05832},
 volume = {abs/2007.05832},
 year = {2020}
}

Ajay Kumar Tanwani, Raghav Anand, Joseph E. Gonzalez, and Ken Goldberg. "RILaaS: Robot Inference and Learning as a Service." IEEE Robotics and Automation Letters, 2020.

Programming robots is complicated due to the lack of 'plug-and-play' modules for skill acquisition. Virtualizing deployment of deep learning models can facilitate large-scale use/re-use of off-the-shelf functional behaviors. Deploying deep learning models on robots entails real-time, accurate and reliable inference service under varying query load. This letter introduces a novel Robot-Inference-and-Learning-as-a-Service (RILaaS) platform for low-latency and secure inference serving of deep models that can be deployed on robots. Unique features of RILaaS include: 1) low-latency and reliable serving with gRPC under dynamic loads by distributing queries over multiple servers on Edge and Cloud, 2) SSH based authentication coupled with SSL/TLS based encryption for security and privacy of the data, and 3) front-end REST API for sharing, monitoring and visualizing performance metrics of the available models. We report experiments to evaluate the RILaaS platform under varying loads of batch size, number of robots, and various model placement hosts on Cloud, Edge, and Fog for providing benchmark applications of object recognition and grasp planning as a service. We address the complexity of load balancing with a reinforcement learning algorithm that optimizes simulated profiles of networked robots, outperforming several baselines including round robin, least connections, and least model time with 68.30\% and 14.04\% decreases in round-trip latency across models compared to the worst and the next best baseline, respectively. Details and updates are available at: https://sites.google.com/view/rilaas.

@article{Tanwani20,
 abstract = {Programming robots is complicated due to the lack of `plug-and-play' modules for skill acquisition. Virtualizing deployment of deep learning models can facilitate large-scale use/re-use of off-the-shelf functional behaviors. Deploying deep learning models on robots entails real-time, accurate and reliable inference service under varying query load. This letter introduces a novel Robot-Inference-and-Learning-as-a-Service (RILaaS) platform for low-latency and secure inference serving of deep models that can be deployed on robots. Unique features of RILaaS include: 1) low-latency and reliable serving with gRPC under dynamic loads by distributing queries over multiple servers on Edge and Cloud, 2) SSH based authentication coupled with SSL/TLS based encryption for security and privacy of the data, and 3) front-end REST API for sharing, monitoring and visualizing performance metrics of the available models. We report experiments to evaluate the RILaaS platform under varying loads of batch size, number of robots, and various model placement hosts on Cloud, Edge, and Fog for providing benchmark applications of object recognition and grasp planning as a service. We address the complexity of load balancing with a reinforcement learning algorithm that optimizes simulated profiles of networked robots; outperforming several baselines including round robin, least connections, and least model time with 68.30\% and 14.04\% decrease in round-trip latency time across models compared to the worst and the next best baseline respectively. Details and updates are available at: \url{https://sites.google.com/view/rilaas.} },
 author = {Ajay Kumar Tanwani and Raghav Anand and Joseph E. Gonzalez and Ken Goldberg},
 bdsk-url-1 = {https://ieeexplore.ieee.org/document/9103220},
 date-modified = {2020-08-02 11:27:35 -0700},
 journal = {IEEE Robotics and Automation Letters},
 keywords = {peerrev},
 number = {3},
 pages = {4423--4430},
 title = { {RILaaS}: Robot Inference and Learning as a Service},
 url = {https://ieeexplore.ieee.org/document/9103220},
 volume = {5},
 year = {2020}
}

Brijen Thananjeyan, Ashwin Balakrishna, Ugo Rosolia, Felix Li, Rowan McAllister, Joseph E. Gonzalez, Sergey Levine, Francesco Borrelli, and Ken Goldberg. "Safety Augmented Value Estimation From Demonstrations (SAVED): Safe Deep Model-Based RL for Sparse Cost Robotic Tasks." IEEE Robotics Autom. Lett., 2020.

Reinforcement learning (RL) for robotics is challenging due to the difficulty in hand-engineering a dense cost function, which can lead to unintended behavior, and dynamical uncertainty, which makes exploration and constraint satisfaction challenging. We address these issues with a new model-based reinforcement learning algorithm, Safety Augmented Value Estimation from Demonstrations (SAVED), which uses supervision that only identifies task completion and a modest set of suboptimal demonstrations to constrain exploration and learn efficiently while handling complex constraints. We then compare SAVED with 3 state-of-the-art model-based and model-free RL algorithms on 6 standard simulation benchmarks involving navigation and manipulation and a physical knot-tying task on the da Vinci surgical robot. Results suggest that SAVED outperforms prior methods in terms of success rate, constraint satisfaction, and sample efficiency, making it feasible to safely learn a control policy directly on a real robot in less than an hour. For tasks on the robot, baselines succeed less than 5\% of the time while SAVED has a success rate of over 75\% in the first 50 training iterations. Code and supplementary material are available.

@article{SAVED20,
 abstract = {Reinforcement learning (RL) for robotics is challenging due to the difficulty in hand-engineering a dense cost function, which can lead to unintended behavior, and dynamical uncertainty, which makes exploration and constraint satisfaction challenging. We address these issues with a new model-based reinforcement learning algorithm, Safety Augmented Value Estimation from Demonstrations (SAVED), which uses supervision that only identifies task completion and a modest set of suboptimal demonstrations to constrain exploration and learn efficiently while handling complex constraints. We then compare SAVED with 3 state-of-the-art model-based and model-free RL algorithms on 6 standard simulation benchmarks involving navigation and manipulation and a physical knot-tying task on the da Vinci surgical robot. Results suggest that SAVED outperforms prior methods in terms of success rate, constraint satisfaction, and sample efficiency, making it feasible to safely learn a control policy directly on a real robot in less than an hour. For tasks on the robot, baselines succeed less than 5\% of the time while SAVED has a success rate of over 75\% in the first 50 training iterations. Code and supplementary material is available},
 author = {Brijen Thananjeyan and Ashwin Balakrishna and Ugo Rosolia and Felix Li and Rowan McAllister and Joseph E. Gonzalez and Sergey Levine and Francesco Borrelli and Ken Goldberg},
 bdsk-url-1 = {https://arxiv.org/abs/1905.13402},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/journals/ral/ThananjeyanBRLM20.bib},
 date-modified = {2020-08-02 11:27:35 -0700},
 journal = { {IEEE} Robotics Autom. Lett.},
 keywords = {peerrev, selected},
 number = {2},
 pages = {3612--3619},
 timestamp = {Fri, 22 May 2020 21:54:18 +0200},
 title = {Safety Augmented Value Estimation From Demonstrations {(SAVED):} Safe Deep Model-Based {RL} for Sparse Cost Robotic Tasks},
 url = {https://arxiv.org/abs/1905.13402},
 volume = {5},
 year = {2020}
}

Alvin Wan, Daniel Ho, Younjin Song, Henk Tillman, Sarah Adel Bargal, and Joseph E. Gonzalez. "SegNBDT: Visual Decision Rules for Segmentation." CoRR (arXiv), 2020.

The black-box nature of neural networks limits model decision interpretability, in particular for high-dimensional inputs in computer vision and for dense pixel prediction tasks like segmentation. To address this, prior work combines neural networks with decision trees. However, such models (1) perform poorly when compared to state-of-the-art segmentation models or (2) fail to produce decision rules with spatially-grounded semantic meaning. In this work, we build a hybrid neural-network and decision-tree model for segmentation that (1) attains neural network segmentation accuracy and (2) provides semi-automatically constructed visual decision rules such as "Is there a window?" We obtain semantic visual meaning by extending saliency methods to segmentation and attain accuracy by leveraging insights from neural-backed decision trees, a deep learning analog of decision trees for image classification. Our model SegNBDT attains accuracy within ~2-4\% of the state-of-the-art HRNetV2 segmentation model while also retaining explainability; we achieve state-of-the-art performance for explainable models on three benchmark datasets -- Pascal-Context (49.12\%), Cityscapes (79.01\%), and Look Into Person (51.64\%). Furthermore, user studies suggest visual decision rules are more interpretable, particularly for incorrect predictions. Code and pretrained models can be found at this https URL.

@article{wan2020segnbdt,
 abstract = {The black-box nature of neural networks limits model decision interpretability, in particular for high-dimensional inputs in computer vision and for dense pixel prediction tasks like segmentation. To address this, prior work combines neural networks with decision trees. However, such models (1) perform poorly when compared to state-of-the-art segmentation models or (2) fail to produce decision rules with spatially-grounded semantic meaning. In this work, we build a hybrid neural-network and decision-tree model for segmentation that (1) attains neural network segmentation accuracy and (2) provides semi-automatically constructed visual decision rules such as ''Is there a window?.'' We obtain semantic visual meaning by extending saliency methods to segmentation and attain accuracy by leveraging insights from neural-backed decision trees, a deep learning analog of decision trees for image classification. Our model SegNBDT attains accuracy within ~2-4\% of the state-of-the-art HRNetV2 segmentation model while also retaining explainability; we achieve state-of-the-art performance for explainable models on three benchmark datasets -- Pascal-Context (49.12\%), Cityscapes (79.01\%), and Look Into Person (51.64\%). Furthermore, user studies suggest visual decision rules are more interpretable, particularly for incorrect predictions. Code and pretrained models can be found at this https URL.},
 archiveprefix = {arXiv},
 author = {Alvin Wan and Daniel Ho and Younjin Song and Henk Tillman and Sarah Adel Bargal and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/2006.06868},
 code = {https://github.com/daniel-ho/SegNBDT},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {2006.06868},
 journal = {CoRR},
 keywords = {arxivpre},
 primaryclass = {cs.CV},
 title = { {SegNBDT}: Visual Decision Rules for Segmentation},
 url = {https://arxiv.org/abs/2006.06868},
 year = {2020}
}

Samvit Jain, Xun Zhang, Yuhao Zhou, Ganesh Ananthanarayanan, Junchen Jiang, Yuanchao Shu, Paramvir Bahl, and Joseph Gonzalez. "Spatula: Efficient cross-camera video analytics on large camera networks." 5th IEEE/ACM Symposium on Edge Computing, SEC 2020, San Jose, CA, USA, November 12-14, 2020, 2020.

Cameras are deployed at scale with the purpose of searching and tracking objects of interest (e.g., a suspected person) through the camera network on live videos. Such cross-camera analytics is data- and compute-intensive, with costs that grow with the number of cameras and time. We present Spatula, a cost-efficient system that enables scaling cross-camera analytics on edge compute boxes to large camera networks by leveraging the spatial and temporal cross-camera correlations. While such correlations have been used in the computer vision community, Spatula uses them to drastically reduce the communication and computation costs by pruning the search space of a query identity (e.g., ignoring frames not correlated with the query identity's current position). Spatula provides the first system substrate on which cross-camera analytics applications can be built to efficiently harness the cross-camera correlations that are abundant in large camera deployments. Spatula reduces compute load by 8.3x on an 8-camera dataset, and by 23x-86x on two datasets with hundreds of cameras (simulated from real vehicle/pedestrian traces). We have also implemented Spatula on a testbed of 5 AWS DeepLens cameras.

@inproceedings{JainZZAJSBG20,
 abstract = {Cameras are deployed at scale with the purpose of searching and tracking objects of interest (e.g., a suspected person) through the camera network on live videos. Such cross-camera analytics is data and compute intensive, whose costs grow with the number of cameras and time. We present Spatula, a cost-efficient system that enables scaling cross-camera analytics on edge compute boxes to large camera networks by leveraging the spatial and temporal cross-camera correlations. While such correlations have been used in computer vision community, Spatula uses them to drastically reduce the communication and computation costs by pruning search space of a query identity (e.g., ignoring frames not correlated with the query identity's current position). Spatula provides the first system substrate on which cross-camera analytics applications can be built to efficiently harness the cross-camera correlations that are abundant in large camera deployments. Spatula reduces compute load by 8.3x on an 8-camera dataset, and by 23x-86x on two datasets with hundreds of cameras (simulated from real vehicle/pedestrian traces). We have also implemented Spatula on a testbed of 5 AWS DeepLens cameras.},
 author = {Samvit Jain and Xun Zhang and Yuhao Zhou and Ganesh Ananthanarayanan and Junchen Jiang and Yuanchao Shu and Paramvir Bahl and Joseph Gonzalez},
 bdsk-url-1 = {https://doi.org/10.1109/SEC50012.2020.00016},
 booktitle = {5th {IEEE/ACM} Symposium on Edge Computing, {SEC} 2020, San Jose, CA, USA, November 12-14, 2020},
 doi = {10.1109/SEC50012.2020.00016},
 keywords = {peerrev},
 pages = {110--124},
 publisher = { {IEEE} },
 title = {Spatula: Efficient cross-camera video analytics on large camera networks},
 url = {https://doi.org/10.1109/SEC50012.2020.00016},
 year = {2020}
}

Bohan Zhai, Tianren Gao, Flora Xue, Daniel Rothchild, Bichen Wu, Joseph E. Gonzalez, and Kurt Keutzer. "SqueezeWave: Extremely Lightweight Vocoders for On-device Speech Synthesis." CoRR (arXiv), 2020.

Automatic speech synthesis is a challenging task that is becoming increasingly important as edge devices begin to interact with users through speech. Typical text-to-speech pipelines include a vocoder, which translates intermediate audio representations into an audio waveform. Most existing vocoders are difficult to parallelize since each generated sample is conditioned on previous samples. WaveGlow is a flow-based feed-forward alternative to these auto-regressive models (Prenger et al., 2019). However, while WaveGlow can be easily parallelized, the model is too expensive for real-time speech synthesis on the edge. This paper presents SqueezeWave, a family of lightweight vocoders based on WaveGlow that can generate audio of similar quality to WaveGlow with 61x - 214x fewer MACs. Code, trained models, and generated audio are publicly available at this https URL.

@article{SqueezeWave20,
 abstract = {Automatic speech synthesis is a challenging task that is becoming increasingly important as edge devices begin to interact with users through speech. Typical text-to-speech pipelines include a vocoder, which translates intermediate audio representations into an audio waveform. Most existing vocoders are difficult to parallelize since each generated sample is conditioned on previous samples. WaveGlow is a flow-based feed-forward alternative to these auto-regressive models (Prenger et al., 2019). However, while WaveGlow can be easily parallelized, the model is too expensive for real-time speech synthesis on the edge. This paper presents SqueezeWave, a family of lightweight vocoders based on WaveGlow that can generate audio of similar quality to WaveGlow with 61x - 214x fewer MACs. Code, trained models, and generated audio are publicly available at this https URL.},
 archiveprefix = {arXiv},
 author = {Bohan Zhai and Tianren Gao and Flora Xue and Daniel Rothchild and Bichen Wu and Joseph E. Gonzalez and Kurt Keutzer},
 bdsk-url-1 = {https://arxiv.org/abs/2001.05685},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {2001.05685},
 journal = {CoRR},
 keywords = {arxivpre},
 title = {SqueezeWave: Extremely Lightweight Vocoders for On-device Speech Synthesis},
 url = {https://arxiv.org/abs/2001.05685},
 volume = {abs/2001.05685},
 year = {2020}
}

Devin Petersohn, William W. Ma, Doris Jung Lin Lee, Stephen Macke, Doris Xin, Xiangxi Mo, Joseph E. Gonzalez, Joseph M. Hellerstein, Anthony D. Joseph, and Aditya G. Parameswaran. "Towards Scalable Dataframe Systems." Proceedings of Very Large Data Bases (PVLDB), 2020.

Dataframes are a popular abstraction to represent, prepare, and analyze data. Despite the remarkable success of dataframe libraries in R and Python, dataframes face performance issues even on moderately large datasets. Moreover, there is significant ambiguity regarding dataframe semantics. In this paper we lay out a vision and roadmap for scalable dataframe systems. To demonstrate the potential in this area, we report on our experience building MODIN, a scaled-up implementation of the most widely-used and complex dataframe API today, Python's pandas. With pandas as a reference, we propose a simple data model and algebra for dataframes to ground discussion in the field. Given this foundation, we lay out an agenda of open research opportunities where the distinct features of dataframes will require extending the state of the art in many dimensions of data management. We discuss the implications of signature dataframe features including flexible schemas, ordering, row/column equivalence, and data/metadata fluidity, as well as the piecemeal, trial-and-error-based approach to interacting with dataframes.
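
Since the entry below links to the Modin code, here is a minimal usage sketch of the drop-in model described above (an illustration, not from the paper): Modin exposes the pandas API, so switching the import is the only change. The file and column names are hypothetical, and a supported engine such as Ray or Dask is assumed to be installed.

import modin.pandas as pd   # drop-in for `import pandas as pd`; the API is the same

# Reads and subsequent operations are partitioned and executed in parallel.
df = pd.read_csv("trips.csv")                              # hypothetical file
fares = df.groupby("payment_type")["fare_amount"].mean()   # hypothetical columns
print(fares)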

@inproceedings{Modin20,
 abstract = {Dataframes are a popular abstraction to represent, prepare, and analyze data. Despite the remarkable success of dataframe libraries in R and Python, dataframes face performance issues even on moderately large datasets. Moreover, there is significant ambiguity regarding dataframe semantics. In this paper we lay out a vision and roadmap for scalable dataframe systems. To demonstrate the potential in this area, we report on our experience building MODIN, a scaled-up implementation of the most widely-used and complex dataframe API today, Python's pandas. With pandas as a reference, we propose a simple data model and algebra for dataframes to ground discussion in the field. Given this foundation, we lay out an agenda of open research opportunities where the distinct features of dataframes will require extending the state of the art in many dimensions of data management. We discuss the implications of signature dataframe features including flexible schemas, ordering, row/column equivalence, and data/metadata fluidity, as well as the piecemeal, trial-and-error-based approach to interacting with dataframes.},
 author = {Devin Petersohn and William W. Ma and Doris Jung Lin Lee and Stephen Macke and Doris Xin and Xiangxi Mo and Joseph E. Gonzalez and Joseph M. Hellerstein and Anthony D. Joseph and Aditya G. Parameswaran},
 bdsk-url-1 = {http://www.vldb.org/pvldb/vol13/p2033-petersohn.pdf},
 booktitle = {Proceedings of Very Large Data Bases (PVLDB)},
 code = {https://github.com/modin-project/modin},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = {Towards Scalable Dataframe Systems},
 url = {http://www.vldb.org/pvldb/vol13/p2033-petersohn.pdf},
 volume = {13},
 year = {2020}
}

Samvit Jain, Xin Wang, and Joseph Gonzalez. "Accel: A Corrective Fusion Network for Efficient Semantic Segmentation on Video." The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2019.

We present Accel, a novel semantic video segmentation system that achieves high accuracy at low inference cost by combining the predictions of two network branches: (1) a reference branch that extracts high-detail features on a reference keyframe, and warps these features forward using frame-to-frame optical flow estimates, and (2) an update branch that computes features of adjustable quality on the current frame, performing a temporal update at each video frame. The modularity of the update branch, where feature subnetworks of varying layer depth can be inserted (e.g. ResNet-18 to ResNet-101), enables operation over a new, state-of-the-art accuracy-throughput trade-off spectrum. Over this curve, Accel models achieve both higher accuracy and faster inference times than the closest comparable single-frame segmentation networks. In general, Accel significantly outperforms previous work on efficient semantic video segmentation, correcting warping-related error that compounds on datasets with complex dynamics. Accel is end-to-end trainable and highly modular: the reference network, the optical flow network, and the update network can each be selected independently, depending on application requirements, and then jointly fine-tuned. The result is a robust, general system for fast, high-accuracy semantic segmentation on video.
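
As a rough illustration of the two-branch design described above (a toy sketch under assumptions, not the authors' implementation): flow_net, update_net, fuse, and head are placeholder modules, and the optical flow is assumed to be a dense (dx, dy) displacement field in pixels.

import torch
import torch.nn.functional as F

def warp(feat, flow):
    # Warp a feature map by a dense (dx, dy) flow field using bilinear sampling.
    n, _, h, w = feat.shape
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    base = torch.stack((xs, ys), dim=-1).float().to(feat.device)   # (h, w, 2), (x, y) order
    coords = base + flow.permute(0, 2, 3, 1)                       # (n, h, w, 2) target positions
    gx = 2 * coords[..., 0] / (w - 1) - 1                          # normalize to [-1, 1]
    gy = 2 * coords[..., 1] / (h - 1) - 1
    return F.grid_sample(feat, torch.stack((gx, gy), dim=-1), align_corners=True)

def accel_step(key_feat, key_img, cur_img, flow_net, update_net, fuse, head):
    flow = flow_net(key_img, cur_img)          # flow from keyframe to current frame
    warped = warp(key_feat, flow)              # reference branch: propagate high-detail features
    cheap = update_net(cur_img)                # update branch: adjustable-quality features
    return head(fuse(torch.cat([warped, cheap], dim=1)))   # per-pixel class scores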

@inproceedings{Accel19,
 abstract = {We present Accel, a novel semantic video segmentation system that achieves high accuracy at low inference cost by combining the predictions of two network branches: (1) a reference branch that extracts high-detail features on a reference keyframe, and warps these features forward using frame-to-frame optical flow estimates, and (2) an update branch that computes features of adjustable quality on the current frame, performing a temporal update at each video frame. The modularity of the update branch, where feature subnetworks of varying layer depth can be inserted (e.g. ResNet-18 to ResNet-101), enables operation over a new, state-of-the-art accuracy-throughput trade-off spectrum. Over this curve, Accel models achieve both higher accuracy and faster inference times than the closest comparable single-frame segmentation networks. In general, Accel significantly outperforms previous work on efficient semantic video segmentation, correcting warping-related error that compounds on datasets with complex dynamics. Accel is end-to-end trainable and highly modular: the reference network, the optical flow network, and the update network can each be selected independently, depending on application requirements, and then jointly fine-tuned. The result is a robust, general system for fast, high-accuracy semantic segmentation on video.},
 author = {Samvit Jain and Xin Wang and Joseph Gonzalez},
 bdsk-url-1 = {http://arxiv.org/abs/1807.06667},
 booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {6},
 title = {Accel: {A} Corrective Fusion Network for Efficient Semantic Segmentation on Video},
 url = {http://arxiv.org/abs/1807.06667},
 year = {2019}
}

Xin Wang, Fisher Yu, Ruth Wang, Trevor Darrell, and Joseph E. Gonzalez. "TAFE-Net: Task-Aware Feature Embeddings for Low Shot Learning." The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2019.

Learning good feature embeddings for images often requires substantial training data. As a consequence, in settings where training data is limited (e.g., few-shot and zero-shot learning), we are typically forced to use a generic feature embedding across various tasks. Ideally, we want to construct feature embeddings that are tuned for the given task. In this work, we propose Task-Aware Feature Embedding Networks (TAFE-Nets) to learn how to adapt the image representation to a new task in a meta learning fashion. Our network is composed of a meta learner and a prediction network. Based on a task input, the meta learner generates parameters for the feature layers in the prediction network so that the feature embedding can be accurately adjusted for that task. We show that TAFE-Net is highly effective in generalizing to new tasks or concepts and evaluate the TAFE-Net on a range of benchmarks in zero-shot and few-shot learning. Our model matches or exceeds the state-of-the-art on all tasks. In particular, our approach improves the prediction accuracy of unseen attribute-object pairs by 4 to 15 points on the challenging visual attribute-object composition task.
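
The core mechanism, a meta learner that emits the weights of a task-conditioned feature layer, can be sketched as follows (an illustrative toy, not the released model; the dimensions are arbitrary).

import torch
import torch.nn as nn
import torch.nn.functional as F

class TaskAwareFeature(nn.Module):
    def __init__(self, task_dim, feat_dim, out_dim):
        super().__init__()
        # meta learner: task embedding -> parameters of a linear feature layer
        self.weight_gen = nn.Linear(task_dim, out_dim * feat_dim)
        self.bias_gen = nn.Linear(task_dim, out_dim)
        self.feat_dim, self.out_dim = feat_dim, out_dim

    def forward(self, image_feat, task_emb):
        w = self.weight_gen(task_emb).view(self.out_dim, self.feat_dim)
        b = self.bias_gen(task_emb)
        return F.linear(image_feat, w, b)        # task-adapted image embedding

layer = TaskAwareFeature(task_dim=300, feat_dim=512, out_dim=128)
adapted = layer(torch.randn(8, 512), torch.randn(300))   # 8 generic features adapted to one task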

@inproceedings{Tafe19,
 abstract = {Learning good feature embeddings for images often requires substantial training data. As a consequence, in settings where training data is limited (e.g., few-shot and zero-shot learning), we are typically forced to use a generic feature embedding across various tasks. Ideally, we want to construct feature embeddings that are tuned for the given task. In this work, we propose Task-Aware Feature Embedding Networks (TAFE-Nets) to learn how to adapt the image representation to a new task in a meta learning fashion. Our network is composed of a meta learner and a prediction network. Based on a task input, the meta learner generates parameters for the feature layers in the prediction network so that the feature embedding can be accurately adjusted for that task. We show that TAFE-Net is highly effective in generalizing to new tasks or concepts and evaluate the TAFE-Net on a range of benchmarks in zero-shot and few-shot learning. Our model matches or exceeds the state-of-the-art on all tasks. In particular, our approach improves the prediction accuracy of unseen attribute-object pairs by 4 to 15 points on the challenging visual attribute-object composition task.},
 author = {Xin Wang and Fisher Yu and Ruth Wang and Trevor Darrell and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/1904.05967},
 booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
 code = {https://github.com/ucbdrive/tafe-net},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {6},
 title = { {TAFE-Net}: Task-Aware Feature Embeddings for Low Shot Learning},
 url = {https://arxiv.org/abs/1904.05967},
 year = {2019}
}

Eric Jonas, Johann Schleier-Smith, Vikram Sreekanti, Chia-Che Tsai, Anurag Khandelwal, Qifan Pu, Vaishaal Shankar, Joao Menezes Carreira, Karl Krauth, Neeraja Yadwadkar, Joseph E. Gonzalez, Raluca Ada Popa, Ion Stoica, and David A. Patterson. "Cloud Programming Simplified: A Berkeley View on Serverless Computing." EECS Department, University of California, Berkeley Technical Report, 2019.

Serverless cloud computing handles virtually all the system administration operations needed to make it easier for programmers to use the cloud. It provides an interface that greatly simplifies cloud programming, and represents an evolution that parallels the transition from assembly language to high-level programming languages. This paper gives a quick history of cloud computing, including an accounting of the predictions of the 2009 Berkeley View of Cloud Computing paper, explains the motivation for serverless computing, describes applications that stretch the current limits of serverless, and then lists obstacles and research opportunities required for serverless computing to fulfill its full potential. Just as the 2009 paper identified challenges for the cloud and predicted they would be addressed and that cloud use would accelerate, we predict these issues are solvable and that serverless computing will grow to dominate the future of cloud computing.

@techreport{Jonas2019,
 abstract = {Serverless cloud computing handles virtually all the system administration operations needed to make it easier for programmers to use the cloud. It provides an interface that greatly simplifies cloud programming, and represents an evolution that parallels the transition from assembly language to high-level programming languages. This paper gives a quick history of cloud computing, including an accounting of the predictions of the 2009 Berkeley View of Cloud Computing paper, explains the motivation for serverless computing, describes applications that stretch the current limits of serverless, and then lists obstacles and research opportunities required for serverless computing to fulfill its full potential. Just as the 2009 paper identified challenges for the cloud and predicted they would be addressed and that cloud use would accelerate, we predict these issues are solvable and that serverless computing will grow to dominate the future of cloud computing.},
 author = {Eric Jonas and Johann Schleier-Smith and Vikram Sreekanti and Chia-Che Tsai and Anurag Khandelwal and Qifan Pu and Vaishaal Shankar and Joao Menezes Carreira and Karl Krauth and Neeraja Yadwadkar and Joseph E. Gonzalez and Raluca Ada Popa and Ion Stoica and David A. Patterson},
 bdsk-url-1 = {http://www2.eecs.berkeley.edu/Pubs/TechRpts/2019/EECS-2019-3.html},
 date-modified = {2020-08-02 11:27:35 -0700},
 institution = {EECS Department, University of California, Berkeley},
 keywords = {techreport},
 month = {2},
 number = {UCB/EECS-2019-3},
 title = {Cloud Programming Simplified: A Berkeley View on Serverless Computing},
 url = {http://www2.eecs.berkeley.edu/Pubs/TechRpts/2019/EECS-2019-3.html},
 year = {2019}
}

Samvit Jain, Ganesh Ananthanarayanan, Junchen Jiang, Yuanchao Shu, and Joseph E. Gonzalez. "Scaling Video Analytics Systems to Large Camera Deployments." HotMobile `19, Proceedings of the 20th International Workshop on Mobile Computing Systems and Applications, 2019.

Driven by advances in computer vision and the falling costs of camera hardware, organizations are deploying video cameras en masse for the spatial monitoring of their physical premises. Scaling video analytics to massive camera deployments, however, presents a new and mounting challenge, as compute cost grows proportionally to the number of camera feeds. This paper is driven by a simple question: can we scale video analytics in such a way that cost grows sublinearly, or even remains constant, as we deploy more cameras, while inference accuracy remains stable, or even improves? We believe the answer is yes. Our key observation is that video feeds from wide-area camera deployments demonstrate significant content correlations (e.g. to other geographically proximate feeds), both in space and over time. These spatio-temporal correlations can be harnessed to dramatically reduce the size of the inference search space, decreasing both workload and false positive rates in multi-camera video analytics. By discussing use-cases and technical challenges, we propose a roadmap for scaling video analytics to large camera networks, and outline a plan for its realization.
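
The key observation can be illustrated with a toy pruning rule (purely schematic, not the paper's system): only run inference on feeds whose learned spatio-temporal correlation with the query's last-seen camera exceeds a threshold.

import numpy as np

num_cams = 100
rng = np.random.default_rng(0)
# corr[i, j]: estimated probability that an object seen at camera i appears at
# camera j within a short time window (learned offline from historical traces).
corr = rng.random((num_cams, num_cams))

def cameras_to_search(last_seen_cam, threshold=0.8):
    return np.flatnonzero(corr[last_seen_cam] >= threshold)

active = cameras_to_search(last_seen_cam=17)
print(f"running inference on {len(active)} of {num_cams} feeds")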

@inproceedings{Hotmobil2019,
 abstract = {Driven by advances in computer vision and the falling costs of camera hardware, organizations are deploying video cameras en masse for the spatial monitoring of their physical premises. Scaling video analytics to massive camera deployments, however, presents a new and mounting challenge, as compute cost grows proportionally to the number of camera feeds. This paper is driven by a simple question: can we scale video analytics in such a way that cost grows sublinearly, or even remains constant, as we deploy more cameras, while inference accuracy remains stable, or even improves. We believe the answer is yes. Our key observation is that video feeds from wide-area camera deployments demonstrate significant content correlations (e.g. to other geographically proximate feeds), both in space and over time. These spatio-temporal correlations can be harnessed to dramatically reduce the size of the inference search space, decreasing both workload and false positive rates in multi-camera video analytics. By discussing use-cases and technical challenges, we propose a roadmap for scaling video analytics to large camera networks, and outline a plan for its realization.},
 author = {Samvit Jain and Ganesh Ananthanarayanan and Junchen Jiang and Yuanchao Shu and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/1809.02318},
 booktitle = {HotMobile `19, Proceedings of the 20th International Workshop on Mobile Computing Systems and Applications},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {2},
 title = {Scaling Video Analytics Systems to Large Camera Deployments},
 url = {https://arxiv.org/abs/1809.02318},
 year = {2019}
}

Zuxuan Wu, Xin Wang, Joseph E. Gonzalez, Tom Goldstein, and Larry S. Davis. "ACE: Adapting to Changing Environments for Semantic Segmentation." International Conference in Computer Vision (ICCV), 2019.

Deep neural networks exhibit exceptional accuracy when they are trained and tested on the same data distributions. However, neural classifiers are often extremely brittle when confronted with domain shift---changes in the input distribution that occur over time. We present ACE, a framework for semantic segmentation that dynamically adapts to changing environments over time. By aligning the distribution of labeled training data from the original source domain with the distribution of incoming data in a shifted domain, ACE synthesizes labeled training data for environments as it sees them. This stylized data is then used to update a segmentation model so that it performs well in new environments. To avoid forgetting knowledge from past environments, we introduce a memory that stores feature statistics from previously seen domains. These statistics can be used to replay images in any of the previously observed domains, thus preventing catastrophic forgetting. In addition to standard batch training using stochastic gradient descent (SGD), we also experiment with fast adaptation methods based on adaptive meta-learning. Extensive experiments are conducted on two datasets from SYNTHIA; the results demonstrate the effectiveness of the proposed approach when adapting to a number of tasks.
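
The feature-statistics memory can be illustrated with an AdaIN-style renormalization (a simplified sketch of the idea, not the authors' code): store per-domain channel means and standard deviations, then re-normalize source features to a stored domain's statistics to replay that domain.

import torch

memory = {}   # domain name -> (per-channel mean, per-channel std)

def remember(domain, feats):                     # feats: (n, c, h, w)
    memory[domain] = (feats.mean(dim=(0, 2, 3)), feats.std(dim=(0, 2, 3)))

def replay(feats, domain, eps=1e-5):
    mu_t, sd_t = memory[domain]
    mu_s = feats.mean(dim=(2, 3), keepdim=True)
    sd_s = feats.std(dim=(2, 3), keepdim=True)
    normalized = (feats - mu_s) / (sd_s + eps)   # strip source-domain statistics
    return normalized * sd_t.view(1, -1, 1, 1) + mu_t.view(1, -1, 1, 1)

remember("night", torch.randn(16, 64, 32, 32) * 2 + 1)
stylized = replay(torch.randn(4, 64, 32, 32), "night")   # source features restyled as "night"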

@inproceedings{WangICCV19,
 abstract = {Deep neural networks exhibit exceptional accuracy when they are trained and tested on the same data distributions. However, neural classifiers are often extremely brittle when confronted with domain shift---changes in the input distribution that occur over time. We present ACE, a framework for semantic segmentation that dynamically adapts to changing environments over time. By aligning the distribution of labeled training data from the original source domain with the distribution of incoming data in a shifted domain, ACE synthesizes labeled training data for environments as it sees them. This stylized data is then used to update a segmentation model so that it performs well in new environments. To avoid forgetting knowledge from past environments, we introduce a memory that stores feature statistics from previously seen domains. These statistics can be used to replay images in any of the previously observed domains, thus preventing catastrophic forgetting. In addition to standard batch training using stochastic gradient descent (SGD), we also experiment with fast adaptation methods based on adaptive meta-learning. Extensive experiments are conducted on two datasets from SYNTHIA; the results demonstrate the effectiveness of the proposed approach when adapting to a number of tasks.},
 author = {Zuxuan Wu and Xin Wang and Joseph E. Gonzalez and Tom Goldstein and Larry S. Davis},
 bdsk-url-1 = {http://arxiv.org/abs/1904.06268},
 booktitle = {International Conference in Computer Vision (ICCV)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {10},
 title = { {ACE}: Adapting to Changing Environments for Semantic Segmentation},
 url = {http://arxiv.org/abs/1904.06268},
 year = {2019}
}

Joseph M. Hellerstein, Jose M. Faleiro, Joseph E. Gonzalez, Johann Schleier-Smith, Vikram Sreekanti, Alexey Tumanov, and Chenggang Wu. "Serverless Computing: One Step Forward, Two Steps Back." Conference on Innovative Data Systems Research (CIDR '19), 2019.

Serverless computing offers the potential to program the cloud in an autoscaling, pay-as-you-go manner. In this paper we address critical gaps in first-generation serverless computing, which place its autoscaling potential at odds with dominant trends in modern computing: notably data-centric and distributed computing, but also open source and custom hardware. Put together, these gaps make current serverless offerings a bad fit for cloud innovation and particularly bad for data systems innovation. In addition to pinpointing some of the main shortfalls of current serverless architectures, we raise a set of challenges we believe must be met to unlock the radical potential that the cloud---with its exabytes of storage and millions of cores---should offer to innovative developers.

@inproceedings{cidr19,
 abstract = {Serverless computing offers the potential to program the cloud in an autoscaling, pay-as-you-go manner. In this paper we address critical gaps in first-generation serverless computing, which place its autoscaling potential at odds with dominant trends in modern computing: notably data-centric and distributed computing, but also open source and custom hardware. Put together, these gaps make current serverless offerings a bad fit for cloud innovation and particularly bad for data systems innovation. In addition to pinpointing some of the main shortfalls of current serverless architectures, we raise a set of challenges we believe must be met to unlock the radical potential that the cloud---with its exabytes of storage and millions of cores---should offer to innovative developers.},
 author = {Joseph M. Hellerstein and Jose M. Faleiro and Joseph E. Gonzalez and Johann Schleier{-}Smith and Vikram Sreekanti and Alexey Tumanov and Chenggang Wu},
 bdsk-url-1 = {https://arxiv.org/abs/1812.03651},
 booktitle = {Conference on Innovative Data Systems Research ({CIDR} '19)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {1},
 title = {Serverless Computing: One Step Forward, Two Steps Back},
 url = {https://arxiv.org/abs/1812.03651},
 year = {2019}
}

Ajay Kumar Tanwani, Nitesh Mor, John Kubiatowicz, Joseph E. Gonzalez, and Ken Goldberg. "A Fog Robotics Approach to Deep Robot Learning: Application to Object Recognition and Grasp Planning in Surface Decluttering." International Conference on Robotics and Automation, ICRA 2019, Montreal, QC, Canada, May 20-24, 2019, 2019.

The growing demand for industrial, automotive and service robots presents a challenge to the centralized Cloud Robotics model in terms of privacy, security, latency, bandwidth, and reliability. In this paper, we present a `Fog Robotics' approach to deep robot learning that distributes compute, storage and networking resources between the Cloud and the Edge in a federated manner. Deep models are trained on non-private (public) synthetic images in the Cloud; the models are adapted to the private real images of the environment at the Edge within a trusted network and subsequently, deployed as a service for low-latency and secure inference/prediction for other robots in the network. We apply this approach to surface decluttering, where a mobile robot picks and sorts objects from a cluttered floor by learning a deep object recognition and a grasp planning model. Experiments suggest that Fog Robotics can improve performance by sim-to-real domain adaptation in comparison to exclusively using Cloud or Edge resources, while reducing the inference cycle time by 4x to successfully declutter 86\% of objects over 213 attempts.

@inproceedings{Tanwani19,
 abstract = {The growing demand for industrial, automotive and service robots presents a challenge to the centralized Cloud Robotics model in terms of privacy, security, latency, bandwidth, and reliability. In this paper, we present a `Fog Robotics' approach to deep robot learning that distributes compute, storage and networking resources between the Cloud and the Edge in a federated manner. Deep models are trained on non-private (public) synthetic images in the Cloud; the models are adapted to the private real images of the environment at the Edge within a trusted network and subsequently, deployed as a service for low-latency and secure inference/prediction for other robots in the network. We apply this approach to surface decluttering, where a mobile robot picks and sorts objects from a cluttered floor by learning a deep object recognition and a grasp planning model. Experiments suggest that Fog Robotics can improve performance by sim-to-real domain adaptation in comparison to exclusively using Cloud or Edge resources, while reducing the inference cycle time by 4\times to successfully declutter 86\% of objects over 213 attempts.},
 author = {Ajay Kumar Tanwani and Nitesh Mor and John Kubiatowicz and Joseph E. Gonzalez and Ken Goldberg},
 bdsk-url-1 = {https://doi.org/10.1109/ICRA.2019.8793690},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/conf/icra/TanwaniMKGG19},
 booktitle = {International Conference on Robotics and Automation, {ICRA} 2019, Montreal, QC, Canada, May 20-24, 2019},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 pages = {4559--4566},
 timestamp = {Tue, 13 Aug 2019 20:25:20 +0200},
 title = {A Fog Robotics Approach to Deep Robot Learning: Application to Object Recognition and Grasp Planning in Surface Decluttering},
 url = {https://doi.org/10.1109/ICRA.2019.8793690},
 year = {2019}
}

Tianjun Zhang, Zhewei Yao, Amir Gholami, Kurt Keutzer, Joseph E. Gonzalez, George Biros, and Michael W. Mahoney. "ANODEV2: A Coupled Neural ODE Evolution Framework." Neural Information Processing Systems (NeurIPS), 2019.

It has been observed that residual networks can be viewed as the explicit Euler discretization of an Ordinary Differential Equation (ODE). This observation motivated the introduction of so-called Neural ODEs, which allow more general discretization schemes with adaptive time stepping. Here, we propose ANODEV2, which is an extension of this approach that also allows evolution of the neural network parameters, in a coupled ODE-based formulation. The Neural ODE method introduced earlier is in fact a special case of this new more general framework. We present the formulation of ANODEV2, derive optimality conditions, and implement a coupled reaction-diffusion-advection version of this framework in PyTorch. We present empirical results using several different configurations of ANODEV2, testing them on multiple models on CIFAR-10. We report results showing that this coupled ODE-based framework is indeed trainable, and that it achieves higher accuracy, as compared to the baseline models as well as the recently-proposed Neural ODE approach.
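
Schematically, and with illustrative notation rather than the paper's, the coupled formulation pairs an ODE for the activations with an ODE for the parameters:

\frac{dz(t)}{dt} = f\bigl(z(t), \theta(t)\bigr), \qquad z(0) = z_{\text{in}}, \qquad
\frac{d\theta(t)}{dt} = g\bigl(\theta(t), p\bigr), \qquad \theta(0) = \theta_0,

so the original Neural ODE is recovered as the special case g \equiv 0, i.e. parameters held constant over t.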

@inproceedings{Zhang19,
 abstract = {It has been observed that residual networks can be viewed as the explicit Euler discretization of an Ordinary Differential Equation (ODE). This observation motivated the introduction of so-called Neural ODEs, which allow more general discretization schemes with adaptive time stepping. Here, we propose ANODEV2, which is an extension of this approach that also allows evolution of the neural network parameters, in a coupled ODE-based formulation. The Neural ODE method introduced earlier is in fact a special case of this new more general framework. We present the formulation of ANODEV2, derive optimality conditions, and implement a coupled reaction-diffusion-advection version of this framework in PyTorch. We present empirical results using several different configurations of ANODEV2, testing them on multiple models on CIFAR-10. We report results showing that this coupled ODE-based framework is indeed trainable, and that it achieves higher accuracy, as compared to the baseline models as well as the recently-proposed Neural ODE approach.},
 author = {Tianjun Zhang and Zhewei Yao and Amir Gholami and Kurt Keutzer and Joseph E. Gonzalez and George Biros and Michael W. Mahoney},
 bdsk-url-1 = {https://arxiv.org/abs/1906.04596},
 booktitle = {Neural Information Processing Systems ({NeurIPS})},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = { {ANODEV2:} A Coupled Neural ODE Evolution Framework},
 url = {https://arxiv.org/abs/1906.04596},
 year = {2019}
}

Vidit Saxena, Joakim Jald\'en, Joseph E. Gonzalez, Mats Bengtsson, Hugo M. Tullberg, and Ion Stoica. "Contextual Multi-Armed Bandits for Link Adaptation in Cellular Networks." Proceedings of the Workshop on Network Meets AI (NetAI) at SIGCOMM, 2019.

Cellular networks dynamically adjust the transmission parameters for a wireless link in response to its time-varying channel state. This is known as link adaptation, where the typical goal is to maximize the link throughput. State-of-the-art outer loop link adaptation (OLLA) selects the optimal transmission parameters based on an approximate, offline, model of the wireless link. Further, OLLA refines the offline model by dynamically compensating any deviations from the observed link performance. However, in practice, OLLA suffers from slow convergence and a sub-optimal link throughput. In this paper, we propose a link adaptation approach that overcomes the shortcomings of OLLA through a novel learning scheme. Our approach relies on contextual multi-armed bandits (MAB), where the context vector is composed of the instantaneous wireless channel state along with side information about the link. For a given context, our approach learns the success probability for each of the available transmission parameters, which is then exploited to select the throughput-maximizing parameters. Through numerical experiments, we show that our approach converges faster than OLLA and achieves a higher steady-state link throughput. For frequent and infrequent channel reports respectively, our scheme outperforms OLLA by 15\% and 25\% in terms of the steady-state link throughput.
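
The selection rule described above, estimating a per-configuration success probability from the channel context and picking the configuration that maximizes expected throughput, can be sketched with a simple per-arm logistic model (a toy illustration; the paper's estimator and features are not reproduced here).

import numpy as np

class LinkAdaptationBandit:
    def __init__(self, rates, context_dim, lr=0.1):
        self.rates = np.asarray(rates, dtype=float)    # achievable rate per arm
        self.w = np.zeros((len(rates), context_dim))   # per-arm logistic weights
        self.lr = lr

    def _p_success(self, ctx):
        return 1.0 / (1.0 + np.exp(-self.w @ ctx))     # estimated success probability per arm

    def select(self, ctx):
        return int(np.argmax(self._p_success(ctx) * self.rates))   # maximize expected throughput

    def update(self, arm, ctx, success):
        p = self._p_success(ctx)[arm]
        self.w[arm] += self.lr * (success - p) * ctx   # one SGD step on the log-loss

bandit = LinkAdaptationBandit(rates=[1, 2, 4, 6], context_dim=8)
ctx = np.random.randn(8)                               # channel state plus link side information
arm = bandit.select(ctx)
bandit.update(arm, ctx, success=1)                     # ACK/NACK feedback for the chosen arm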

@inproceedings{SaxenaSigcommNetAI,
 abstract = {Cellular networks dynamically adjust the transmission parameters for a wireless link in response to its time-varying channel state. This is known as link adaptation, where the typical goal is to maximize the link throughput. State-of-the-art outer loop link adaptation (OLLA) selects the optimal transmission parameters based on an approximate, offline, model of the wireless link. Further, OLLA refines the offline model by dynamically compensating any deviations from the observed link performance. However, in practice, OLLA suffers from slow convergence and a sub-optimal link throughput. In this paper, we propose a link adaptation approach that overcomes the shortcomings of OLLA through a novel learning scheme. Our approach relies on contextual multi-armed bandits (MAB), where the context vector is composed of the instantaneous wireless channel state along with side information about the link. For a given context, our approach learns the success probability for each of the available transmission parameters, which is then exploited to select the throughput-maximizing parameters. Through numerical experiments, we show that our approach converges faster than OLLA and achieves a higher steady-state link throughput. For frequent and infrequent channel reports respectively, our scheme outperforms OLLA by 15\% and 25\% in terms of the steady-state link throughput.},
 author = {Vidit Saxena and Joakim Jald{\'{e} }n and Joseph E. Gonzalez and Mats Bengtsson and Hugo M. Tullberg and Ion Stoica},
 bdsk-url-1 = {https://doi.org/10.1145/3341216.3342212},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/conf/sigcomm/SaxenaJGBTS19},
 booktitle = {Proceedings of the Workshop on Network Meets {AI} ({NetAI}) at SIGCOMM},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 pages = {44--49},
 timestamp = {Thu, 15 Aug 2019 09:19:24 +0200},
 title = {Contextual Multi-Armed Bandits for Link Adaptation in Cellular Networks},
 url = {https://doi.org/10.1145/3341216.3342212},
 year = {2019}
}

Xin Wang, Fisher Yu, Lisa Dunlap, Yi-An Ma, Ruth Wang, Azalia Mirhoseini, Trevor Darrell, and Joseph E. Gonzalez. "Deep Mixture of Experts via Shallow Embedding." Proceedings of the Thirty-Fifth Conference on Uncertainty in Artificial Intelligence, UAI 2019, Tel Aviv, Israel, July 22-25, 2019, 2019.

Larger networks generally have greater representational power at the cost of increased computational complexity. Sparsifying such networks has been an active area of research but has been generally limited to static regularization or dynamic approaches using reinforcement learning. We explore a mixture of experts (MoE) approach to deep dynamic routing, which activates certain experts in the network on a per-example basis. Our novel DeepMoE architecture increases the representational power of standard convolutional networks by adaptively sparsifying and recalibrating channel-wise features in each convolutional layer. We employ a multi-headed sparse gating network to determine the selection and scaling of channels for each input, leveraging exponential combinations of experts within a single convolutional network. Our proposed architecture is evaluated on four benchmark datasets and tasks, and we show that Deep-MoEs are able to achieve higher accuracy with lower computation than standard convolutional networks.
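
The per-layer gating can be sketched as follows (placeholder architecture, not the paper's): a shallow gating head maps an embedding of the input to sparse, non-negative scales that select and re-weight the channels of a convolutional layer.

import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedConvBlock(nn.Module):
    def __init__(self, in_ch, out_ch, embed_dim):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, 3, padding=1)
        self.gate = nn.Linear(embed_dim, out_ch)     # one gating head for this layer

    def forward(self, x, embedding):
        g = F.relu(self.gate(embedding))             # sparse per-example channel scales
        return F.relu(self.conv(x)) * g.view(g.size(0), -1, 1, 1)

block = GatedConvBlock(in_ch=64, out_ch=128, embed_dim=32)
y = block(torch.randn(4, 64, 16, 16), torch.randn(4, 32))   # -> (4, 128, 16, 16)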

@inproceedings{DeepMoE19,
 abstract = {Larger networks generally have greater representational power at the cost of increased computational complexity. Sparsifying such networks has been an active area of research but has been generally limited to static regularization or dynamic approaches using reinforcement learning. We explore a mixture of experts (MoE) approach to deep dynamic routing, which activates certain experts in the network on a per-example basis. Our novel DeepMoE architecture increases the representational power of standard convolutional networks by adaptively sparsifying and recalibrating channel-wise features in each convolutional layer. We employ a multi-headed sparse gating network to determine the selection and scaling of channels for each input, leveraging exponential combinations of experts within a single convolutional network. Our proposed architecture is evaluated on four benchmark datasets and tasks, and we show that Deep-MoEs are able to achieve higher accuracy with lower computation than standard convolutional networks.},
 author = {Xin Wang and Fisher Yu and Lisa Dunlap and Yi{-}An Ma and Ruth Wang and Azalia Mirhoseini and Trevor Darrell and Joseph E. Gonzalez},
 bdsk-url-1 = {http://auai.org/uai2019/proceedings/papers/192.pdf},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/conf/uai/WangYDMWMDG19},
 booktitle = {Proceedings of the Thirty-Fifth Conference on Uncertainty in Artificial Intelligence, {UAI} 2019, Tel Aviv, Israel, July 22-25, 2019},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 pages = {192},
 timestamp = {Fri, 19 Jul 2019 13:05:12 +0200},
 title = {Deep Mixture of Experts via Shallow Embedding},
 url = {http://auai.org/uai2019/proceedings/papers/192.pdf},
 year = {2019}
}

Wenting Zheng, Raluca Ada Popa, Joseph E. Gonzalez, and Ion Stoica. "Helen: Maliciously Secure Coopetitive Learning for Linear Models." IEEE Symposium on Security and Privacy (Oakland), 2019.

Many organizations wish to collaboratively train machine learning models on their combined datasets for a common benefit (e.g., better medical research, or fraud detection). However, they often cannot share their plaintext datasets due to privacy concerns and/or business competition. In this paper, we design and build Helen, a system that allows multiple parties to train a linear model without revealing their data, a setting we call coopetitive learning. Compared to prior secure training systems, Helen protects against a much stronger adversary who is malicious and can compromise m-1 out of m parties. Our evaluation shows that Helen can achieve up to five orders of magnitude of performance improvement when compared to training using an existing state-of-the-art secure multi-party computation framework.

@inproceedings{Helen19,
 abstract = {Many organizations wish to collaboratively train machine learning models on their combined datasets for a common benefit (e.g., better medical research, or fraud detection). However, they often cannot share their plaintext datasets due to privacy concerns and/or business competition. In this paper, we design and build Helen, a system that allows multiple parties to train a linear model without revealing their data, a setting we call coopetitive learning. Compared to prior secure training systems, Helen protects against a much stronger adversary who is malicious and can compromise m-1 out of m parties. Our evaluation shows that Helen can achieve up to five orders of magnitude of performance improvement when compared to training using an existing state-of-the-art secure multi-party computation framework.},
 author = {Wenting Zheng and Raluca Ada Popa and Joseph E. Gonzalez and Ion Stoica},
 booktitle = { {IEEE} Symposium on Security and Privacy ({Oakland})},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 publisher = { {IEEE} Computer Society},
 title = {Helen: Maliciously Secure Coopetitive Learning for Linear Models},
 url = {https://people.eecs.berkeley.edu/~wzheng/helen\%5Fieeesp.pdf},
 year = {2019}
}

Richard Liaw, Romil Bhardwaj, Lisa Dunlap, Yitian Zou, Joseph E. Gonzalez, Ion Stoica, and Alexey Tumanov. "HyperSched: Dynamic Resource Reallocation for Model Development on a Deadline." Proceedings of the ACM Symposium on Cloud Computing, 2019.

Prior research in resource scheduling for machine learning training workloads has largely focused on minimizing job completion times. Commonly, these model training workloads collectively search over a large number of parameter values that control the learning process in a hyperparameter search. It is preferable to identify and maximally provision the best-performing hyperparameter configuration (trial) to achieve the highest accuracy result as soon as possible. To optimally trade-off evaluating multiple configurations and training the most promising ones by a fixed deadline, we design and build HyperSched---a dynamic application-level resource scheduler to track, identify, and preferentially allocate resources to the best performing trials to maximize accuracy by the deadline. HyperSched leverages three properties of a hyperparameter search workload overlooked in prior work -- trial disposability, progressively identifiable rankings among different configurations, and space-time constraints -- to outperform standard hyperparameter search algorithms across a variety of benchmarks.
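
A toy version of a deadline-aware reallocation policy (purely illustrative; HyperSched itself is built on Ray, see the code link in the entry below) shrinks the set of active trials as the deadline approaches and hands the freed resources to the current leaders.

def reallocate(trials, total_slots, time_left, horizon):
    # trials: list of (trial_id, current_score), higher score is better
    frac_left = time_left / horizon
    keep = max(1, int(len(trials) * frac_left))        # explore early, exploit near the deadline
    survivors = sorted(trials, key=lambda t: t[1], reverse=True)[:keep]
    per_trial = total_slots // len(survivors)          # promising trials get more resources
    return {trial_id: per_trial for trial_id, _ in survivors}

print(reallocate([("a", 0.61), ("b", 0.74), ("c", 0.58)],
                 total_slots=8, time_left=30, horizon=120))   # -> {'b': 8}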

@inproceedings{HyperschedSOCC19,
 abstract = {Prior research in resource scheduling for machine learning training workloads has largely focused on minimizing job completion times. Commonly, these model training workloads collectively search over a large number of parameter values that control the learning process in a hyperparameter search. It is preferable to identify and maximally provision the best-performing hyperparameter configuration (trial) to achieve the highest accuracy result as soon as possible. To optimally trade-off evaluating multiple configurations and training the most promising ones by a fixed deadline, we design and build HyperSched---a dynamic application-level resource scheduler to track, identify, and preferentially allocate resources to the best performing trials to maximize accuracy by the deadline. HyperSched leverages three properties of a hyperparameter search workload overlooked in prior work -- trial disposability, progressively identifiable rankings among different configurations, and space-time constraints -- to outperform standard hyperparameter search algorithms across a variety of benchmarks.},
 address = {New York, NY, USA},
 author = {Richard Liaw and Romil Bhardwaj and Lisa Dunlap and Yitian Zou and Joseph E. Gonzalez and Ion Stoica and Alexey Tumanov},
 bdsk-url-1 = {https://doi.org/10.1145/3357223.3362719},
 booktitle = {Proceedings of the ACM Symposium on Cloud Computing},
 code = {https://github.com/ucbrise/hypersched},
 date-modified = {2020-08-02 11:27:35 -0700},
 isbn = {9781450369732},
 keywords = {peerrev},
 location = {Santa Cruz, CA, USA},
 numpages = {13},
 pages = {61--73},
 publisher = {Association for Computing Machinery},
 series = {SoCC '19},
 title = { {HyperSched}: Dynamic Resource Reallocation for Model Development on a Deadline},
 url = {https://doi.org/10.1145/3357223.3362719},
 year = {2019}
}

Xin Wang, Fisher Yu, Trevor Darrell, and Joseph E. Gonzalez. "Task-Aware Feature Generation for Zero-Shot Compositional Learning." CoRR (arXiv), 2019.

Visual concepts (e.g., red apple, big elephant) are often semantically compositional and each element of the compositions can be reused to construct novel concepts (e.g., red elephant). Compositional feature synthesis, which generates image feature distributions exploiting the semantic compositionality, is a promising approach to sample-efficient model generalization. In this work, we propose a task-aware feature generation (TFG) framework for compositional learning, which generates features of novel visual concepts by transferring knowledge from previously seen concepts. These synthetic features are then used to train a classifier to recognize novel concepts in a zero-shot manner. Our novel TFG design injects task-conditioned noise layer-by-layer, producing task-relevant variation at each level. We find the proposed generator design improves classification accuracy and sample efficiency. Our model establishes a new state of the art on three zero-shot compositional learning (ZSCL) benchmarks, outperforming the previous discriminative models by a large margin. Our model improves the performance of the prior arts by over 2x in the generalized ZSCL setting.

@article{TFG19,
 abstract = {Visual concepts (e.g., red apple, big elephant) are often semantically compositional and each element of the compositions can be reused to construct novel concepts (e.g., red elephant). Compositional feature synthesis, which generates image feature distributions exploiting the semantic compositionality, is a promising approach to sample-efficient model generalization. In this work, we propose a task-aware feature generation (TFG) framework for compositional learning, which generates features of novel visual concepts by transferring knowledge from previously seen concepts. These synthetic features are then used to train a classifier to recognize novel concepts in a zero-shot manner. Our novel TFG design injects task-conditioned noise layer-by-layer, producing task-relevant variation at each level. We find the proposed generator design improves classification accuracy and sample efficiency. Our model establishes a new state of the art on three zero-shot compositional learning (ZSCL) benchmarks, outperforming the previous discriminative models by a large margin. Our model improves the performance of the prior arts by over 2x in the generalized ZSCL setting.},
 archiveprefix = {arXiv},
 author = {Xin Wang and Fisher Yu and Trevor Darrell and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/1906.04854},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {1906.04854},
 journal = {CoRR},
 keywords = {arxivpre},
 primaryclass = {cs.CV},
 title = {Task-Aware Feature Generation for Zero-Shot Compositional Learning},
 url = {https://arxiv.org/abs/1906.04854},
 year = {2019}
}

Rolando Garcia, Vikram Sreekanti, Neeraja Yadwadkar, Daniel Crankshaw, Joseph E. Gonzalez, and Joseph M. Hellerstein. "Context: The Missing Piece in the Machine Learning Lifecycle." Proceedings of the KDD Workshop on Common Model Infrastructure (CMI), 2018.

Machine learning models have become ubiquitous in modern applications. The ML Lifecycle describes a three-phase process used by data scientists and data engineers to develop, train, and serve models. Unfortunately, context around the data, code, people, and systems involved in these pipelines is not captured today. In this paper, we first discuss common pitfalls that missing context creates. Some examples where context is missing include tracking the relationships between code and data and capturing experimental processes over time. We then discuss techniques to address these challenges and briefly mention future work around designing and implementing systems in this space.

@inproceedings{Flor18,
 abstract = {Machine learning models have become ubiquitous in modern applications. The ML Lifecycle describes a three-phase process used by data scientists and data engineers to develop, train, and serve models. Unfortunately, context around the data, code, people, and systems involved in these pipelines is not captured today. In this paper, we first discuss common pitfalls that missing context creates. Some examples where context is missing include tracking the relationships between code and data and capturing experimental processes over time. We then discuss techniques to address these challenges and briefly mention future work around designing and implementing systems in this space.},
 author = {Rolando Garcia and Vikram Sreekanti and Neeraja Yadwadkar and Daniel Crankshaw and Joseph E. Gonzalez and Joseph M. Hellerstein},
 bdsk-url-1 = {http://www.vikrams.io/papers/flor-cmi18.pdf},
 booktitle = {Proceedings of the KDD Workshop on Common Model Infrastructure (CMI)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {8},
 title = {Context: The Missing Piece in the Machine Learning Lifecycle},
 url = {http://www.vikrams.io/papers/flor-cmi18.pdf},
 year = {2018}
}

Xin Wang, Yujia Luo, Dan Crankshaw, Alexey Tumanov, Fisher Yu, and Joseph E. Gonzalez. "IDK Cascades: Fast Deep Learning by Learning not to Overthink." Conference on Uncertainty in Artificial Intelligence (UAI), 2018.

Advances in deep learning have led to substantial increases in prediction accuracy but have been accompanied by increases in the cost of rendering predictions. We conjecture that for a majority of real-world inputs, the recent advances in deep learning have created models that effectively ``overthink'' on simple inputs. In this paper, we revisit the classic question of building model cascades that primarily leverage class asymmetry to reduce cost. We introduce the ``I Don't Know'' (IDK) prediction cascades framework, a general framework to systematically compose a set of pre-trained models to accelerate inference without a loss in prediction accuracy. We propose two search-based methods for constructing cascades as well as a new cost-aware objective within this framework. The proposed IDK cascade framework can be easily adopted in existing model serving systems without additional model re-training. We evaluate the proposed techniques on a range of benchmarks to demonstrate the effectiveness of the proposed framework.
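
The cascade itself is simple to sketch (a schematic illustration, not the paper's implementation): each pre-trained model answers only when sufficiently confident, otherwise it defers to the next, more expensive model.

def idk_cascade(x, models, thresholds):
    # models: cheap-to-expensive callables returning (label, confidence)
    for model, tau in zip(models, thresholds):
        label, conf = model(x)
        if conf >= tau:              # confident enough: answer early and save compute
            return label
    return label                     # fall back to the last model's answer

fast = lambda x: ("cat", 0.55)       # hypothetical cheap model
slow = lambda x: ("dog", 0.97)       # hypothetical expensive model
print(idk_cascade("img.png", [fast, slow], thresholds=[0.9, 0.0]))   # -> dog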

@inproceedings{IDK18,
 abstract = {Advances in deep learning have led to substantial increases in prediction accuracy but have been accompanied by increases in the cost of rendering predictions. We conjecture that for a majority of real-world inputs, the recent advances in deep learning have created models that effectively ``overthink'' on simple inputs. In this paper, we revisit the classic question of building model cascades that primarily leverage class asymmetry to reduce cost. We introduce the ``I Don't Know'' (IDK) prediction cascades framework, a general framework to systematically compose a set of pre-trained models to accelerate inference without a loss in prediction accuracy. We propose two search-based methods for constructing cascades as well as a new cost-aware objective within this framework. The proposed IDK cascade framework can be easily adopted in existing model serving systems without additional model re-training. We evaluate the proposed techniques on a range of benchmarks to demonstrate the effectiveness of the proposed framework.},
 author = {Xin Wang and Yujia Luo and Dan Crankshaw and Alexey Tumanov and Fisher Yu and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/1706.00885},
 booktitle = {Conference on Uncertainty in Artificial Intelligence (UAI)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {7},
 title = { {IDK} Cascades: Fast Deep Learning by Learning not to Overthink},
 url = {https://arxiv.org/abs/1706.00885},
 year = {2018}
}

Eric Liang, Richard Liaw, Robert Nishihara, Philipp Moritz, Roy Fox, Joseph Gonzalez, Ken Goldberg, and Ion Stoica. "Ray RLLib: A Composable and Scalable Reinforcement Learning Library." Proceedings of the 35th International Conference on Machine Learning, 2018.

Reinforcement learning (RL) algorithms involve the deep nesting of highly irregular computation patterns, each of which typically exhibits opportunities for distributed computation. We argue for distributing RL components in a composable way by adapting algorithms for top-down hierarchical control, thereby encapsulating parallelism and resource requirements within short-running compute tasks. We demonstrate the benefits of this principle through RLlib: a library that provides scalable software primitives for RL. These primitives enable a broad range of algorithms to be implemented with high performance, scalability, and substantial code reuse.

@inproceedings{rllibicml2018,
 abstract = {Reinforcement learning (RL) algorithms involve the deep nesting of highly irregular computation patterns, each of which typically exhibits opportunities for distributed computation. We argue for distributing RL components in a composable way by adapting algorithms for top-down hierarchical control, thereby encapsulating parallelism and resource requirements within short-running compute tasks. We demonstrate the benefits of this principle through RLlib: a library that provides scalable software primitives for RL. These primitives enable a broad range of algorithms to be implemented with high performance, scalability, and substantial code reuse.},
 author = {Eric Liang and Richard Liaw and Robert Nishihara and Philipp Moritz and Roy Fox and Joseph Gonzalez and Ken Goldberg and Ion Stoica},
 bdsk-url-1 = {https://arxiv.org/abs/1712.09381},
 booktitle = {Proceedings of the 35th International Conference on Machine Learning},
 code = {https://ray.readthedocs.io/en/latest/rllib.html},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev, selected},
 month = {7},
 publisher = {ACM},
 series = {ICML '18},
 title = {Ray {RLLib}: {A} Composable and Scalable Reinforcement Learning Library},
 url = {https://arxiv.org/abs/1712.09381},
 year = {2018}
}

Dan Crankshaw, Joseph E. Gonzalez, and Peter Bailis. "Research for Practice: Prediction-Serving Systems." Commun. ACM, 2018.

What happens when we wish to actually deploy a machine learning model to production? This survey examines several recent systems for serving machine learning models as well as some classic papers describing early efforts in prediction serving.

@article{acmqueue2018,
 abstract = {What happens when we wish to actually deploy a machine learning model to production? This survey examines several recent systems for serving machine learning models as well as some classic papers describing early efforts in prediction serving.},
 acmid = {3190574},
 address = {New York, NY, USA},
 author = {Dan Crankshaw and Joseph E. Gonzalez and Peter Bailis},
 bdsk-url-1 = {http://doi.acm.org/10.1145/3190574},
 date-modified = {2020-08-02 11:27:35 -0700},
 issn = {0001-0782},
 issue_date = {August 2018},
 journal = {Commun. ACM},
 keywords = {techreport},
 month = {7},
 number = {8},
 numpages = {5},
 pages = {45--49},
 publisher = {ACM},
 title = {Research for Practice: Prediction-Serving Systems},
 url = {http://doi.acm.org/10.1145/3190574},
 volume = {61},
 year = {2018}
}

Xin Wang, Fisher Yu, Zi-Yi Dou, and Joseph E. Gonzalez. "SkipNet: Learning Dynamic Routing in Convolutional Networks." Proceedings of the European Conference on Computer Vision (ECCV), 2018.

While deeper convolutional networks are needed to achieve maximum accuracy in visual perception tasks, for many inputs shallower networks are sufficient. We exploit this observation by learning to skip convolutional layers on a per-input basis. We introduce SkipNet, a modified residual network, that uses a gating network to selectively skip convolutional blocks based on the activations of the previous layer. We formulate the dynamic skipping problem in the context of sequential decision making and propose a hybrid learning algorithm that combines supervised learning and reinforcement learning to address the challenges of non-differentiable skipping decisions. We show SkipNet reduces computation by 30-90\% while preserving the accuracy of the original model on four benchmark datasets and outperforms the state-of-the-art dynamic networks and static compression methods. We also qualitatively evaluate the gating policy to reveal a relationship between image scale and saliency and the number of layers skipped.
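
The gating mechanism can be sketched as follows (a minimal illustration with placeholder layer sizes, not the released code linked in the entry below): a tiny gate inspects the previous activation and decides, per input, whether to execute or skip the residual block.

import torch
import torch.nn as nn

class SkipGatedBlock(nn.Module):
    def __init__(self, block, channels):
        super().__init__()
        self.block = block                         # the residual transformation to (maybe) skip
        self.gate = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(),
                                  nn.Linear(channels, 1))

    def forward(self, x):
        execute = torch.sigmoid(self.gate(x)) > 0.5          # hard per-input decision at inference
        out = x.clone()                                       # skipped inputs pass through unchanged
        mask = execute.squeeze(1)
        if mask.any():
            out[mask] = x[mask] + self.block(x[mask])         # residual path only where gated on
        return out

block = SkipGatedBlock(nn.Conv2d(64, 64, 3, padding=1), channels=64)
y = block(torch.randn(2, 64, 8, 8))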

@inproceedings{Skipnet18,
 abstract = {While deeper convolutional networks are needed to achieve maximum accuracy in visual perception tasks, for many inputs shallower networks are sufficient. We exploit this observation by learning to skip convolutional layers on a per-input basis. We introduce SkipNet, a modified residual network, that uses a gating network to selectively skip convolutional blocks based on the activations of the previous layer. We formulate the dynamic skipping problem in the context of sequential decision making and propose a hybrid learning algorithm that combines supervised learning and reinforcement learning to address the challenges of non-differentiable skipping decisions. We show SkipNet reduces computation by 30-90\% while preserving the accuracy of the original model on four benchmark datasets and outperforms the state-of-the-art dynamic networks and static compression methods. We also qualitatively evaluate the gating policy to reveal a relationship between image scale and saliency and the number of layers skipped.},
 author = {Xin Wang and Fisher Yu and Zi{-}Yi Dou and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/1711.09485},
 booktitle = {Proceedings of the European Conference on Computer Vision ({ECCV})},
 code = {https://github.com/ucbdrive/skipnet},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev, selected},
 month = {7},
 title = { {SkipNet}: Learning Dynamic Routing in Convolutional Networks},
 url = {https://arxiv.org/abs/1711.09485},
 year = {2018}
}

Bichen Wu, Alvin Wan, Xiangyu Yue, Peter Jin, Sicheng Zhao, Noah Golmant, Amir Gholaminejad, Joseph E. Gonzalez, and Kurt Keutzer. "Shift: A Zero FLOP, Zero Parameter Alternative to Spatial Convolutions." The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2018.

Neural networks rely on convolutions to aggregate spatial information. However, spatial convolutions are expensive in terms of model size and computation, both of which grow quadratically with respect to kernel size. In this paper, we present a parameter-free, FLOP-free ``shift'' operation as an alternative to spatial convolutions. We fuse shifts and point-wise convolutions to construct end-to-end trainable shift-based modules, with a hyperparameter characterizing the tradeoff between accuracy and efficiency. To demonstrate the operation's efficacy, we replace ResNet's 3x3 convolutions with shift-based modules for improved CIFAR10 and CIFAR100 accuracy using 60\% fewer parameters; we additionally demonstrate the operation's resilience to parameter reduction on ImageNet, outperforming ResNet family members. We finally show the shift operation's applicability across domains, achieving strong performance with fewer parameters on classification, face verification and style transfer.
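
The shift primitive itself is easy to illustrate (a simplified sketch, not the paper's CUDA kernel; torch.roll wraps around at the borders where the paper's operator zero-pads): channel groups are translated by fixed spatial offsets, and all learning lives in the surrounding 1x1 convolutions.

import torch
import torch.nn as nn

class Shift(nn.Module):
    # Shift channel groups by fixed (dy, dx) offsets: zero FLOPs, zero parameters.
    def __init__(self, offsets=((0, 1), (0, -1), (1, 0), (-1, 0), (0, 0))):
        super().__init__()
        self.offsets = offsets

    def forward(self, x):
        groups = torch.chunk(x, len(self.offsets), dim=1)
        shifted = [torch.roll(g, shifts=off, dims=(2, 3))
                   for g, off in zip(groups, self.offsets)]
        return torch.cat(shifted, dim=1)

# a shift-based module: point-wise conv -> shift -> point-wise conv
module = nn.Sequential(nn.Conv2d(64, 128, 1), Shift(), nn.Conv2d(128, 64, 1))
y = module(torch.randn(2, 64, 16, 16))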

@inproceedings{Shift18,
 abstract = {Neural networks rely on convolutions to aggregate spatial information. However, spatial convolutions are expensive in terms of model size and computation, both of which grow quadratically with respect to kernel size. In this paper, we present a parameter-free, FLOP-free ``shift'' operation as an alternative to spatial convolutions. We fuse shifts and point-wise convolutions to construct end-to-end trainable shift-based modules, with a hyperparameter characterizing the tradeoff between accuracy and efficiency. To demonstrate the operation's efficacy, we replace ResNet's 3x3 convolutions with shift-based modules for improved CIFAR10 and CIFAR100 accuracy using 60\% fewer parameters; we additionally demonstrate the operation's resilience to parameter reduction on ImageNet, outperforming ResNet family members. We finally show the shift operation's applicability across domains, achieving strong performance with fewer parameters on classification, face verification and style transfer.},
 author = {Bichen Wu and Alvin Wan and Xiangyu Yue and Peter Jin and Sicheng Zhao and Noah Golmant and Amir Gholaminejad and Joseph E. Gonzalez and Kurt Keutzer},
 bdsk-url-1 = {https://arxiv.org/abs/1711.08141},
 booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {6},
 title = {Shift: A Zero {FLOP}, Zero Parameter Alternative to Spatial Convolutions},
 url = {https://arxiv.org/abs/1711.08141},
 year = {2018}
}
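
A rough sketch of the shift idea, assuming the usual formulation of moving groups of channels one pixel in each of the nine neighboring directions and then mixing them with a 1x1 ("point-wise") convolution. Note that torch.roll wraps around at the borders, whereas the paper's operator zero-fills, so this is illustrative rather than equivalent.

import torch
import torch.nn as nn


def shift(x: torch.Tensor) -> torch.Tensor:
    # Split channels into nine groups, each shifted toward one neighboring position
    # (including the identity / no-shift group). No parameters, no multiply-adds.
    n, c, h, w = x.shape
    directions = [(dy, dx) for dy in (-1, 0, 1) for dx in (-1, 0, 1)]
    group = c // len(directions)
    out = x.clone()
    for i, (dy, dx) in enumerate(directions):
        sl = slice(i * group, (i + 1) * group if i < len(directions) - 1 else c)
        out[:, sl] = torch.roll(x[:, sl], shifts=(dy, dx), dims=(2, 3))
    return out


class ShiftBlock(nn.Module):
    """Shift followed by a point-wise conv, standing in for a 3x3 spatial conv."""

    def __init__(self, in_ch: int, out_ch: int):
        super().__init__()
        self.pw = nn.Conv2d(in_ch, out_ch, kernel_size=1)

    def forward(self, x):
        return self.pw(shift(x))


y = ShiftBlock(64, 128)(torch.randn(2, 64, 32, 32))
print(y.shape)  # torch.Size([2, 128, 32, 32])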

Samvit Jain and Joseph E. Gonzalez. "Fast Semantic Segmentation on Video Using Block Motion-Based Feature Interpolation." The Third International Workshop on Video Segmentation (IWVS), 2018.


Convolutional networks optimized for accuracy on challenging, dense prediction tasks are prohibitively slow to run on each frame in a video. The spatial similarity of nearby video frames, however, suggests opportunity to reuse computation. Existing work has explored basic feature reuse and feature warping based on optical flow, but has encountered limits to the speedup attainable with these techniques. In this paper, we present a new, two part approach to accelerating inference on video. First, we propose a fast feature propagation technique that utilizes the block motion vectors present in compressed video (e.g. H.264 codecs) to cheaply propagate features from frame to frame. Second, we develop a novel feature estimation scheme, termed feature interpolation, that fuses features propagated from enclosing keyframes to render accurate feature estimates, even at sparse keyframe frequencies. We evaluate our system on the Cityscapes and CamVid datasets, comparing to both a frame-by-frame baseline and related work. We find that we are able to substantially accelerate segmentation on video, achieving near real-time frame rates (20.1 frames per second) on large images (960 x 720 pixels), while maintaining competitive accuracy. This represents an improvement of almost 6x over the single-frame baseline and 2.5x over the fastest prior work.

@inproceedings{Jain18IWVS,
 abstract = {Convolutional networks optimized for accuracy on challenging, dense prediction tasks are prohibitively slow to run on each frame in a video. The spatial similarity of nearby video frames, however, suggests opportunity to reuse computation. Existing work has explored basic feature reuse and feature warping based on optical flow, but has encountered limits to the speedup attainable with these techniques. In this paper, we present a new, two part approach to accelerating inference on video. First, we propose a fast feature propagation technique that utilizes the block motion vectors present in compressed video (e.g. H.264 codecs) to cheaply propagate features from frame to frame. Second, we develop a novel feature estimation scheme, termed feature interpolation, that fuses features propagated from enclosing keyframes to render accurate feature estimates, even at sparse keyframe frequencies. We evaluate our system on the Cityscapes and CamVid datasets, comparing to both a frame-by-frame baseline and related work. We find that we are able to substantially accelerate segmentation on video, achieving near real-time frame rates (20.1 frames per second) on large images (960 x 720 pixels), while maintaining competitive accuracy. This represents an improvement of almost 6x over the single-frame baseline and 2.5x over the fastest prior work.},
 author = {Samvit Jain and Joseph E. Gonzalez},
 bdsk-url-1 = {https://arxiv.org/abs/1803.07742},
 booktitle = {The Third International Workshop on Video Segmentation (IWVS)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {3},
 title = {Fast Semantic Segmentation on Video Using Block Motion-Based Feature Interpolation},
 url = {https://arxiv.org/abs/1803.07742},
 year = {2018}
}
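
The following numpy sketch illustrates the feature-interpolation step in isolation: features from the two enclosing keyframes are propagated by per-block motion vectors and blended by temporal distance. The block size, motion field, and blending rule here are simplified stand-ins; the actual system reuses the motion vectors already present in the compressed bitstream.

import numpy as np


def propagate(feat: np.ndarray, motion: np.ndarray) -> np.ndarray:
    """Shift each block of a (C, H, W) feature map by its integer (dy, dx) motion vector."""
    c, h, w = feat.shape
    bh, bw = motion.shape[:2]            # motion is (bh, bw, 2), one vector per block
    sy, sx = h // bh, w // bw
    out = np.zeros_like(feat)
    for by in range(bh):
        for bx in range(bw):
            dy, dx = motion[by, bx]
            ys, xs = by * sy, bx * sx
            ty = np.clip(ys + dy, 0, h - sy)
            tx = np.clip(xs + dx, 0, w - sx)
            out[:, ty:ty + sy, tx:tx + sx] = feat[:, ys:ys + sy, xs:xs + sx]
    return out


def interpolate(feat_prev, feat_next, mv_prev, mv_next, alpha):
    """Blend features propagated from the previous and next keyframe; alpha in [0, 1]
    is the relative temporal position of the current frame between them."""
    return (1 - alpha) * propagate(feat_prev, mv_prev) + alpha * propagate(feat_next, mv_next)


f_prev, f_next = np.random.rand(2, 16, 64, 64)
mv = np.zeros((8, 8, 2), dtype=int)      # dummy motion field: no motion
mid = interpolate(f_prev, f_next, mv, mv, alpha=0.5)
print(mid.shape)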

Sicheng Zhao, Bichen Wu, Joseph Gonzalez, Sanjit A. Seshia, and Kurt Keutzer. "Unsupervised Domain Adaptation: from Simulation Engine to the RealWorld." CoRR (arXiv), 2018.

Large-scale labeled training datasets have enabled deep neural networks to excel on a wide range of benchmark vision tasks. However, in many applications it is prohibitively expensive or time-consuming to obtain large quantities of labeled data. To cope with limited labeled training data, many have attempted to directly apply models trained on a large-scale labeled source domain to another sparsely labeled target domain. Unfortunately, direct transfer across domains often performs poorly due to domain shift and dataset bias. Domain adaptation is the machine learning paradigm that aims to learn a model from a source domain that can perform well on a different (but related) target domain. In this paper, we summarize and compare the latest unsupervised domain adaptation methods in computer vision applications. We classify the non-deep approaches into sample re-weighting and intermediate subspace transformation categories, while the deep strategy includes discrepancy-based methods, adversarial generative models, adversarial discriminative models and reconstruction-based methods. We also discuss some potential directions.

@article{Zhao2018,
 abstract = {Large-scale labeled training datasets have enabled deep neural networks to excel on a wide range of benchmark vision tasks. However, in many applications it is prohibitively expensive or time-consuming to obtain large quantities of labeled data. To cope with limited labeled training data, many have attempted to directly apply models trained on a large-scale labeled source domain to another sparsely labeled target domain. Unfortunately, direct transfer across domains often performs poorly due to domain shift and dataset bias. Domain adaptation is the machine learning paradigm that aims to learn a model from a source domain that can perform well on a different (but related) target domain. In this paper, we summarize and compare the latest unsupervised domain adaptation methods in computer vision applications. We classify the non-deep approaches into sample re-weighting and intermediate subspace transformation categories, while the deep strategy includes discrepancy-based methods, adversarial generative models, adversarial discriminative models and reconstruction-based methods. We also discuss some potential directions.},
 archiveprefix = {arXiv},
 author = {Sicheng Zhao and Bichen Wu and Joseph Gonzalez and Sanjit A. Seshia and Kurt Keutzer},
 bdsk-url-1 = {http://arxiv.org/abs/1803.09180},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/journals/corr/abs-1803-09180},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {1803.09180},
 journal = {CoRR},
 keywords = {arxivpre},
 month = {3},
 title = {Unsupervised Domain Adaptation: from Simulation Engine to the RealWorld},
 url = {http://arxiv.org/abs/1803.09180},
 volume = {abs/1803.09180},
 year = {2018}
}

Vladimir Feinberg, Alvin Wan, Ion Stoica, Michael I. Jordan, Joseph E. Gonzalez, and Sergey Levine. "Model-Based Value Estimation for Efficient Model-Free Reinforcement Learning." CoRR (arXiv), 2018.

Recent model-free reinforcement learning algorithms have proposed incorporating learned dynamics models as a source of additional data with the intention of reducing sample complexity. Such methods hold the promise of incorporating imagined data coupled with a notion of model uncertainty to accelerate the learning of continuous control tasks. Unfortunately, they rely on heuristics that limit usage of the dynamics model. We present model-based value expansion, which controls for uncertainty in the model by only allowing imagination to fixed depth. By enabling wider use of learned dynamics models within a model-free reinforcement learning algorithm, we improve value estimation, which, in turn, reduces the sample complexity of learning.

@article{Feinberg2018,
 abstract = {Recent model-free reinforcement learning algorithms have proposed incorporating learned dynamics models as a source of additional data with the intention of reducing sample complexity. Such methods hold the promise of incorporating imagined data coupled with a notion of model uncertainty to accelerate the learning of continuous control tasks. Unfortunately, they rely on heuristics that limit usage of the dynamics model. We present model-based value expansion, which controls for uncertainty in the model by only allowing imagination to fixed depth. By enabling wider use of learned dynamics models within a model-free reinforcement learning algorithm, we improve value estimation, which, in turn, reduces the sample complexity of learning.},
 archiveprefix = {arXiv},
 author = {Vladimir Feinberg and Alvin Wan and Ion Stoica and Michael I. Jordan and Joseph E. Gonzalez and Sergey Levine},
 bdsk-url-1 = {http://arxiv.org/abs/1803.00101},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/journals/corr/abs-1803-00101},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {1803.00101},
 journal = {CoRR},
 keywords = {arxivpre},
 month = {2},
 title = {Model-Based Value Estimation for Efficient Model-Free Reinforcement Learning},
 url = {http://arxiv.org/abs/1803.00101},
 volume = {abs/1803.00101},
 year = {2018}
}
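
A toy sketch of the value-expansion target, assuming placeholder policy, dynamics, reward, and value functions: imagined transitions extend the observed reward for a fixed depth before bootstrapping with the value estimate, which is the core idea the abstract describes.

def mve_target(s_next, r, done, policy, dynamics, reward_fn, value_fn, gamma=0.99, horizon=3):
    """H-step target for Q(s, a): observed reward, `horizon` imagined rewards, then a bootstrap."""
    target = r
    discount = gamma
    s = s_next
    for _ in range(horizon):
        if done:
            break
        a = policy(s)                     # action from the current policy
        target += discount * reward_fn(s, a)
        s = dynamics(s, a)                # learned model rolls the state forward
        discount *= gamma
    return target + (0.0 if done else discount * value_fn(s))


# Stand-in models for a 1-D toy problem (all of these are assumptions for illustration).
policy = lambda s: -0.1 * s
dynamics = lambda s, a: s + a
reward_fn = lambda s, a: -float(s ** 2)
value_fn = lambda s: -float(s ** 2) / (1 - 0.99)

print(mve_target(s_next=1.0, r=-1.0, done=False,
                 policy=policy, dynamics=dynamics,
                 reward_fn=reward_fn, value_fn=value_fn))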

Xiangxi Mo, Paras Jain, Ajay Jain, Alexey Tumanov, Joseph E. Gonzalez, and Ion Stoica. "A Case for Dynamic GPU Inference Multitenancy and Scheduling." Proceedings of the Learning Systems Workshop at NIPS 2018, 2018.

Serving deep neural networks in latency critical interactive settings often requires GPU acceleration. However, the small batch sizes typical in online inference results in poor GPU utilization, a potential performance gap which GPU resource sharing can address. In this paper, we explore several techniques to leverage both temporal and spatial multiplexing to improve GPU utilization for deep learning inference workloads. We evaluate the performance trade-offs of each approach with respect to resource-efficiency, latency predictability, and isolation when compared with conventional batched inference. Our experimental analysis suggests at least a 5x potential for improved utilization through the exploration of more advanced spatial and temporal multiplexing strategies. Our preliminary prototype of a dynamic space-time scheduler demonstrates a 3.18x speedup over space-only multiplexing and a 7.76x speedup over time-only multiplexing, while also providing better isolation and latency predictability.

@inproceedings{LearningSys2018,
 abstract = {Serving deep neural networks in latency critical interactive settings often requires GPU acceleration. However, the small batch sizes typical in online inference results in poor GPU utilization, a potential performance gap which GPU resource sharing can address. In this paper, we explore several techniques to leverage both temporal and spatial multiplexing to improve GPU utilization for deep learning inference workloads. We evaluate the performance trade-offs of each approach with respect to resource-efficiency, latency predictability, and isolation when compared with conventional batched inference. Our experimental analysis suggests at least a 5x potential for improved utilization through the exploration of more advanced spatial and temporal multiplexing strategies. Our preliminary prototype of a dynamic space-time scheduler demonstrates a 3.18x speedup over space-only multiplexing and a 7.76x speedup over time-only multiplexing, while also providing better isolation and latency predictability.},
 author = {Xiangxi Mo and Paras Jain and Ajay Jain and Alexey Tumanov and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {http://learningsys.org/nips18/assets/papers/102CameraReadySubmissionGPU%5FVirtualization%20(8).pdf},
 booktitle = {Proceedings of the Learning Systems Workshop at NIPS 2018},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {12},
 title = {A Case for Dynamic GPU Inference Multitenancy and Scheduling},
 url = {http://learningsys.org/nips18/assets/papers/102CameraReadySubmissionGPU%5FVirtualization%20(8).pdf},
 year = {2018}
}

J. Weston Hughes, Taylor Sittler, Anthony D. Joseph, Jeffrey E. Olgin, Joseph E. Gonzalez, and Geoffrey H. Tison. "Using Multitask Learning to Improve 12-Lead Electrocardiogram Classification." CoRR (arXiv), 2018.

We develop a multi-task convolutional neural network (CNN) to classify multiple diagnoses from 12-lead electrocardiograms (ECGs) using a dataset comprised of over 40,000 ECGs, with labels derived from cardiologist clinical interpretations. Since many clinically important classes can occur in low frequencies, approaches are needed to improve performance on rare classes. We compare the performance of several single-class classifiers on rare classes to a multi-headed classifier across all available classes. We demonstrate that the addition of common classes can significantly improve CNN performance on rarer classes when compared to a model trained on the rarer class in isolation. Using this method, we develop a model with high performance as measured by F1 score on multiple clinically relevant classes compared against the gold-standard cardiologist interpretation.

@article{Hughes18,
 abstract = {We develop a multi-task convolutional neural network (CNN) to classify multiple diagnoses from 12-lead electrocardiograms (ECGs) using a dataset comprised of over 40,000 ECGs, with labels derived from cardiologist clinical interpretations. Since many clinically important classes can occur in low frequencies, approaches are needed to improve performance on rare classes. We compare the performance of several single-class classifiers on rare classes to a multi-headed classifier across all available classes. We demonstrate that the addition of common classes can significantly improve CNN performance on rarer classes when compared to a model trained on the rarer class in isolation. Using this method, we develop a model with high performance as measured by F1 score on multiple clinically relevant classes compared against the gold-standard cardiologist interpretation.},
 archiveprefix = {arXiv},
 author = {J. Weston Hughes and Taylor Sittler and Anthony D. Joseph and Jeffrey E. Olgin and Joseph E. Gonzalez and Geoffrey H. Tison},
 bdsk-url-1 = {http://arxiv.org/abs/1812.00497},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/journals/corr/abs-1812-00497},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {1812.00497},
 journal = {CoRR},
 keywords = {arxivpre},
 month = {12},
 title = {Using Multitask Learning to Improve 12-Lead Electrocardiogram Classification},
 url = {http://arxiv.org/abs/1812.00497},
 volume = {abs/1812.00497},
 year = {2018}
}
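
As a concrete and entirely illustrative picture of the multi-headed setup, the sketch below shares a small 1-D convolutional trunk across diagnoses and attaches one binary output per class, trained with a multi-label loss so common classes can shape features that also help rare ones. Layer sizes and signal lengths are assumptions, not the paper's architecture.

import torch
import torch.nn as nn


class MultiTaskECG(nn.Module):
    def __init__(self, num_classes: int, leads: int = 12):
        super().__init__()
        self.trunk = nn.Sequential(
            nn.Conv1d(leads, 32, kernel_size=15, stride=2, padding=7), nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=15, stride=2, padding=7), nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )
        self.heads = nn.Linear(64, num_classes)   # one logit per diagnosis

    def forward(self, x):                          # x: (batch, 12, samples)
        return self.heads(self.trunk(x).squeeze(-1))


model = MultiTaskECG(num_classes=20)
ecg = torch.randn(8, 12, 5000)                     # e.g. 8 ECGs, 10 s at 500 Hz
labels = torch.randint(0, 2, (8, 20)).float()      # multi-label targets
loss = nn.BCEWithLogitsLoss()(model(ecg), labels)
loss.backward()
print(float(loss))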

Noah Golmant, Nikita Vemuri, Zhewei Yao, Vladimir Feinberg, Amir Gholami, Kai Rothauge, Michael W. Mahoney, and Joseph Gonzalez. "On the Computational Inefficiency of Large Batch Sizes for Stochastic Gradient Descent." CoRR (arXiv), 2018.

Increasing the mini-batch size for stochastic gradient descent offers significant opportunities to reduce wall-clock training time, but there are a variety of theoretical and systems challenges that impede the widespread success of this technique. We investigate these issues, with an emphasis on time to convergence and total computational cost, through an extensive empirical analysis of network training across several architectures and problem domains, including image classification, image segmentation, and language modeling. Although it is common practice to increase the batch size in order to fully exploit available computational resources, we find a substantially more nuanced picture. Our main finding is that across a wide range of network architectures and problem domains, increasing the batch size beyond a certain point yields no decrease in wall-clock time to convergence for \emph{either} train or test loss. This batch size is usually substantially below the capacity of current systems. We show that popular training strategies for large batch size optimization begin to fail before we can populate all available compute resources, and we show that the point at which these methods break down depends more on attributes like model architecture and data complexity than it does directly on the size of the dataset.

@article{Golmant18,
 abstract = {Increasing the mini-batch size for stochastic gradient descent offers significant opportunities to reduce wall-clock training time, but there are a variety of theoretical and systems challenges that impede the widespread success of this technique. We investigate these issues, with an emphasis on time to convergence and total computational cost, through an extensive empirical analysis of network training across several architectures and problem domains, including image classification, image segmentation, and language modeling. Although it is common practice to increase the batch size in order to fully exploit available computational resources, we find a substantially more nuanced picture. Our main finding is that across a wide range of network architectures and problem domains, increasing the batch size beyond a certain point yields no decrease in wall-clock time to convergence for \emph{either} train or test loss. This batch size is usually substantially below the capacity of current systems. We show that popular training strategies for large batch size optimization begin to fail before we can populate all available compute resources, and we show that the point at which these methods break down depends more on attributes like model architecture and data complexity than it does directly on the size of the dataset.},
 archiveprefix = {arXiv},
 author = {Noah Golmant and Nikita Vemuri and Zhewei Yao and Vladimir Feinberg and Amir Gholami and Kai Rothauge and Michael W. Mahoney and Joseph Gonzalez},
 bdsk-url-1 = {http://arxiv.org/abs/1811.12941},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/journals/corr/abs-1811-12941},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {1811.12941},
 journal = {CoRR},
 keywords = {arxivpre},
 month = {11},
 title = {On the Computational Inefficiency of Large Batch Sizes for Stochastic Gradient Descent},
 url = {http://arxiv.org/abs/1811.12941},
 volume = {abs/1811.12941},
 year = {2018}
}
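
The toy measurement below, on a synthetic logistic-regression problem, illustrates the kind of trade-off the paper studies: beyond some batch size, the number of SGD steps needed to reach a fixed training loss stops shrinking while the total number of examples processed keeps growing. The problem, learning rate, and threshold are arbitrary choices for illustration.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 20))
w_true = rng.normal(size=20)
y = (X @ w_true + 0.1 * rng.normal(size=5000) > 0).astype(float)


def steps_to_target(batch_size, lr=0.2, target=0.5, max_steps=5000):
    """Count SGD steps until the full training loss drops below `target`."""
    w = np.zeros(20)
    for step in range(1, max_steps + 1):
        idx = rng.integers(0, len(X), batch_size)
        p = 1.0 / (1.0 + np.exp(-X[idx] @ w))
        w -= lr * X[idx].T @ (p - y[idx]) / batch_size
        q = 1.0 / (1.0 + np.exp(-X @ w))
        loss = -np.mean(y * np.log(q + 1e-9) + (1 - y) * np.log(1 - q + 1e-9))
        if loss < target:
            return step
    return max_steps


for bs in (8, 32, 128, 512, 2048):
    steps = steps_to_target(bs)
    print(f"batch={bs:5d}  steps to target loss={steps:5d}  examples processed={steps * bs}")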

Richard Liaw, Eric Liang, Robert Nishihara, Philipp Moritz, Joseph E. Gonzalez, and Ion Stoica. "Tune: A Research Platform for Distributed Model Selection and Training." Proceedings of the ICML Workshop on AutoML, 2018.

Modern machine learning algorithms are increasingly computationally demanding, requiring specialized hardware and distributed computation to achieve high performance in a reasonable time frame. Many hyperparameter search algorithms have been proposed for improving the efficiency of model selection, however their adaptation to the distributed compute environment is often ad-hoc. We propose Tune, a unified framework for model selection and training that provides a narrow-waist interface between training scripts and search algorithms. We show that this interface meets the requirements for a broad range of hyperparameter search algorithms, allows straightforward scaling of search to large clusters, and simplifies algorithm implementation. We demonstrate the implementation of several state-of-the-art hyperparameter search algorithms in Tune.

@inproceedings{Tune18,
 abstract = {Modern machine learning algorithms are increasingly computationally demanding, requiring specialized hardware and distributed computation to achieve high performance in a reasonable time frame. Many hyperparameter search algorithms have been proposed for improving the efficiency of model selection, however their adaptation to the distributed compute environment is often ad-hoc. We propose Tune, a unified framework for model selection and training that provides a narrow-waist interface between training scripts and search algorithms. We show that this interface meets the requirements for a broad range of hyperparameter search algorithms, allows straightforward scaling of search to large clusters, and simplifies algorithm implementation. We demonstrate the implementation of several state-of-the-art hyperparameter search algorithms in Tune.},
 author = {Richard Liaw and Eric Liang and Robert Nishihara and Philipp Moritz and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {https://arxiv.org/abs/1807.05118},
 booktitle = {Proceedings of the ICML Workshop on AutoML},
 code = {https://ray.readthedocs.io/en/latest/tune.html},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = {Tune: A Research Platform for Distributed Model Selection and Training},
 url = {https://arxiv.org/abs/1807.05118},
 year = {2018}
}
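
A minimal usage sketch of the narrow-waist interface, written against the later open-source Ray Tune API (tune.run, tune.report, tune.grid_search); the exact function names in the workshop-era release differ, so treat this as illustrative rather than as the paper's interface.

from ray import tune


def trainable(config):
    # Any training script: read hyperparameters from `config`, report metrics back.
    w = 0.0
    for step in range(100):
        w -= config["lr"] * (w - 3.0)          # toy "training" update
        tune.report(loss=(w - 3.0) ** 2)       # hand the metric to the search algorithm


analysis = tune.run(
    trainable,
    config={"lr": tune.grid_search([0.001, 0.01, 0.1])},
    num_samples=1,
)
print(analysis.get_best_config(metric="loss", mode="min"))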

Ion Stoica, Dawn Song, Raluca Ada Popa, David A. Patterson, Michael W. Mahoney, Randy H. Katz, Anthony D. Joseph, Michael Jordan, Joseph M. Hellerstein, Joseph E. Gonzalez, Ken Goldberg, Ali Ghodsi, David E. Culler, and Pieter Abbeel. "A Berkeley View of Systems Challenges for AI." EECS Department, University of California, Berkeley Technical Report, 2017.

With the increasing commoditization of computer vision, speech recognition and machine translation systems and the widespread deployment of learning-based back-end technologies such as digital advertising and intelligent infrastructures, AI (Artificial Intelligence) has moved from research labs to production. These changes have been made possible by unprecedented levels of data and computation, by methodological advances in machine learning, by innovations in systems software and architectures, and by the broad accessibility of these technologies. The next generation of AI systems promises to accelerate these developments and increasingly impact our lives via frequent interactions and making (often mission-critical) decisions on our behalf, often in highly personalized contexts. Realizing this promise, however, raises daunting challenges. In particular, we need AI systems that make timely and safe decisions in unpredictable environments, that are robust against sophisticated adversaries, and that can process ever increasing amounts of data across organizations and individuals without compromising confidentiality. These challenges will be exacerbated by the end of the Moore's Law, which will constrain the amount of data these technologies can store and process. In this paper, we propose several open research directions in systems, architectures, and security that can address these challenges and help unlock AI's potential to improve lives and society.

@techreport{Stoica17,
 abstract = {
With the increasing commoditization of computer vision, speech recognition and machine translation systems and the widespread deployment of learning-based back-end technologies such as digital advertising and intelligent infrastructures, AI (Artificial Intelligence) has moved from research labs to production. These changes have been made possible by unprecedented levels of data and computation, by methodological advances in machine learning, by innovations in systems software and architectures, and by the broad accessibility of these technologies.

The next generation of AI systems promises to accelerate these developments and increasingly impact our lives via frequent interactions and making (often mission-critical) decisions on our behalf, often in highly personalized contexts. Realizing this promise, however, raises daunting challenges. In particular, we need AI systems that make timely and safe decisions in unpredictable environments, that are robust against sophisticated adversaries, and that can process ever increasing amounts of data across organizations and individuals without compromising confidentiality. These challenges will be exacerbated by the end of the Moore's Law, which will constrain the amount of data these technologies can store and process. In this paper, we propose several open research directions in systems, architectures, and security that can address these challenges and help unlock AI's potential to improve lives and society.
},
 author = {Ion Stoica and Dawn Song and Raluca Ada Popa and David A. Patterson and Michael W. Mahoney and Randy H. Katz and Anthony D. Joseph and Michael Jordan and Joseph M. Hellerstein and Joseph E. Gonzalez and Ken Goldberg and Ali Ghodsi and David E. Culler and Pieter Abbeel},
 bdsk-url-1 = {http://www2.eecs.berkeley.edu/Pubs/TechRpts/2017/EECS-2017-159.html},
 date-modified = {2020-08-02 11:27:35 -0700},
 institution = {EECS Department, University of California, Berkeley},
 keywords = {techreport},
 month = {9},
 number = {UCB/EECS-2017-159},
 title = {A Berkeley View of Systems Challenges for {AI} },
 url = {http://www2.eecs.berkeley.edu/Pubs/TechRpts/2017/EECS-2017-159.html},
 year = {2017}
}

Neeraja J. Yadwadkar, Bharath Hariharan, Joseph E. Gonzalez, Burton Smith, and Randy H. Katz. "Selecting the Best VM Across Multiple Public Clouds: A Data-driven Performance Modeling Approach." Proceedings of the 2017 Symposium on Cloud Computing, 2017.

Users of cloud services are presented with a bewildering choice of VM types and the choice of VM can have significant implications on performance and cost. In this paper we address the fundamental problem of accurately and economically choosing the best VM for a given workload and user goals. To address the problem of optimal VM selection, we present PARIS, a data-driven system that uses a novel hybrid offline and online data collection and modeling framework to provide accurate performance estimates with minimal data collection. PARIS is able to predict workload performance for different user-specified metrics, and resulting costs for a wide range of VM types and workloads across multiple cloud providers. When compared to sophisticated baselines, including collaborative filtering and a linear interpolation model using measured workload performance on two VM types, PARIS produces significantly better estimates of performance. For instance, it reduces runtime prediction error by a factor of 4 for some workloads on both AWS and Azure. The increased accuracy translates into a 45\% reduction in user cost while maintaining performance.

@inproceedings{Paris17,
 abstract = {Users of cloud services are presented with a bewildering choice of VM types and the choice of VM can have significant implications on performance and cost. In this paper we address the fundamental problem of accurately and economically choosing the best VM for a given workload and user goals. To address the problem of optimal VM selection, we present PARIS, a data-driven system that uses a novel hybrid offline and online data collection and modeling framework to provide accurate performance estimates with minimal data collection. PARIS is able to predict workload performance for different user-specified metrics, and resulting costs for a wide range of VM types and workloads across multiple cloud providers. When compared to sophisticated baselines, including collaborative filtering and a linear interpolation model using measured workload performance on two VM types, PARIS produces significantly better estimates of performance. For instance, it reduces runtime prediction error by a factor of 4 for some workloads on both AWS and Azure. The increased accuracy translates into a 45\% reduction in user cost while maintaining performance.},
 acmid = {3131614},
 author = {Neeraja J. Yadwadkar and Bharath Hariharan and Joseph E. Gonzalez and Burton Smith and Randy H. Katz},
 bdsk-url-1 = {https://doi.acm.org/10.1145/3127479.3131614},
 booktitle = {Proceedings of the 2017 Symposium on Cloud Computing},
 date-modified = {2020-08-02 11:27:35 -0700},
 isbn = {978-1-4503-5028-0},
 keywords = {peerrev},
 location = {Santa Clara, California},
 month = {9},
 numpages = {14},
 pages = {452--465},
 publisher = {ACM},
 series = { {SoCC} '17},
 title = {Selecting the Best {VM} Across Multiple Public Clouds: A Data-driven Performance Modeling Approach},
 url = {https://doi.acm.org/10.1145/3127479.3131614},
 year = {2017}
}
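
PARIS itself is not reproduced here; the sketch below only illustrates the general data-driven recipe of learning a performance model over workload fingerprints and VM features and then ranking candidate VM types by predicted runtime and cost. All feature names, numbers, and prices are invented.

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)

# Synthetic training data: columns = workload fingerprint (cpu-, mem-, io-intensity
# measured on reference VM types) followed by candidate VM resources (scaled vcpus, memory).
X = rng.uniform(size=(500, 5))
runtime = 100 / (1 + 3 * X[:, 3] * X[:, 0] + 2 * X[:, 4] * X[:, 1]) + rng.normal(0, 1, 500)

model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, runtime)

# Score (hypothetical) candidate VM types for a new workload fingerprint.
fingerprint = np.array([0.8, 0.3, 0.1])                       # cpu / mem / io intensity
candidates = {"c4.large": (2, 3.75, 0.10), "m4.xlarge": (4, 16.0, 0.20)}
for name, (vcpus, mem, dollars_per_hour) in candidates.items():
    pred = model.predict([np.concatenate([fingerprint, [vcpus / 8, mem / 32]])])[0]
    print(f"{name}: predicted runtime {pred:.1f}s, estimated cost ${pred / 3600 * dollars_per_hour:.4f}")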

Francois W. Belletti, Evan R. Sparks, Michael J. Franklin, Alexandre M. Bayen, and Joseph E. Gonzalez. "Random Projection Design for Scalable Implicit Smoothing of Randomly Observed Stochastic Processes." Artificial Intelligence and Statistics (AIStats '17), 2017.

Sampling at random timestamps, long range dependencies, and scale hamper standard methods for multivariate time series analysis. In this paper we present a novel estimator for cross-covariance of randomly observed time series which unravels the dynamics of an unobserved stochastic process. We analyze the statistical properties of our estimator without needing the assumption that observation timestamps are independent from the process of interest and show that our solution is not hindered by the issues affecting standard estimators for cross-covariance. We implement and evaluate our statistically sound and scalable approach in the distributed setting using Apache Spark and demonstrate its ability to unravel causal dynamics on both simulations and high-frequency financial trading data.

@inproceedings{aistats17,
 abstract = {Sampling at random timestamps, long range dependencies, and scale hamper standard methods for multivariate time series analysis. In this paper we present a novel estimator for cross-covariance of randomly observed time series which unravels the dynamics of an unobserved stochastic process. We analyze the statistical properties of our estimator without needing the assumption that observation timestamps are independent from the process of interest and show that our solution is not hindered by the issues affecting standard estimators for cross-covariance. We implement and evaluate our statistically sound and scalable approach in the distributed setting using Apache Spark and demonstrate its ability to unravel causal dynamics on both simulations and high-frequency financial trading data.},
 author = {Francois W. Belletti and Evan R. Sparks and Michael J. Franklin and Alexandre M. Bayen and Joseph E. Gonzalez},
 bdsk-url-1 = {http://proceedings.mlr.press/v54/belletti17a/belletti17a.pdf},
 booktitle = {Artificial Intelligence and Statistics ({AIStats} '17)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {7},
 title = {Random Projection Design for Scalable Implicit Smoothing of Randomly Observed Stochastic Processes},
 url = {http://proceedings.mlr.press/v54/belletti17a/belletti17a.pdf},
 year = {2017}
}

Richard Liaw, Sanjay Krishnan, Animesh Garg, Daniel Crankshaw, Joseph E. Gonzalez, and Ken Goldberg. "Composing Meta-Policies for Autonomous Driving Using Hierarchical Deep Reinforcement Learning." CoRR (arXiv), 2017.

Rather than learning new control policies for each new task, it is possible, when tasks share some structure, to compose a ``meta-policy'' from previously learned policies. This paper reports results from experiments using Deep Reinforcement Learning on a continuous-state, discrete-action autonomous driving simulator. We explore how Deep Neural Networks can represent meta-policies that switch among a set of previously learned policies, specifically in settings where the dynamics of a new scenario are composed of a mixture of previously learned dynamics and where the state observation is possibly corrupted by sensing noise. We also report the results of experiments varying dynamics mixes, distractor policies, magnitudes/distributions of sensing noise, and obstacles. In a fully observed experiment, the meta-policy learning algorithm achieves 2.6x the reward achieved by the next best policy composition technique with 80\% less exploration. In a partially observed experiment, the meta-policy learning algorithm converges after 50 iterations while a direct application of RL fails to converge even after 200 iterations.

@article{Liaw2017,
 abstract = {Rather than learning new control policies for each new task, it is possible, when tasks share some structure, to compose a ``meta-policy'' from previously learned policies. This paper reports results from experiments using Deep Reinforcement Learning on a continuous-state, discrete-action autonomous driving simulator. We explore how Deep Neural Networks can represent meta-policies that switch among a set of previously learned policies, specifically in settings where the dynamics of a new scenario are composed of a mixture of previously learned dynamics and where the state observation is possibly corrupted by sensing noise. We also report the results of experiments varying dynamics mixes, distractor policies, magnitudes/distributions of sensing noise, and obstacles. In a fully observed experiment, the meta-policy learning algorithm achieves 2.6x the reward achieved by the next best policy composition technique with 80\% less exploration. In a partially observed experiment, the meta-policy learning algorithm converges after 50 iterations while a direct application of RL fails to converge even after 200 iterations.},
 archiveprefix = {arXiv},
 author = {Richard Liaw and Sanjay Krishnan and Animesh Garg and Daniel Crankshaw and Joseph E. Gonzalez and Ken Goldberg},
 bdsk-url-1 = {http://arxiv.org/abs/1711.01503},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/journals/corr/abs-1711-01503},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {1711.01503},
 journal = {CoRR},
 keywords = {arxivpre},
 month = {11},
 title = {Composing Meta-Policies for Autonomous Driving Using Hierarchical Deep Reinforcement Learning},
 url = {http://arxiv.org/abs/1711.01503},
 volume = {abs/1711.01503},
 year = {2017}
}
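
A structural sketch (not the paper's training algorithm) of what a meta-policy looks like: a small selector network chooses which of K previously learned sub-policies to execute in the current state, and only the chosen sub-policy produces the low-level action. The state dimension and stand-in sub-policies below are assumptions for illustration.

import torch
import torch.nn as nn


class MetaPolicy(nn.Module):
    def __init__(self, state_dim: int, sub_policies):
        super().__init__()
        self.sub_policies = sub_policies                  # frozen, previously learned policies
        self.selector = nn.Sequential(
            nn.Linear(state_dim, 32), nn.ReLU(),
            nn.Linear(32, len(sub_policies)),             # one score per sub-policy
        )

    def forward(self, state: torch.Tensor) -> int:
        idx = int(self.selector(state).argmax())          # switch among the learned policies
        return self.sub_policies[idx](state)


# Two toy "previously learned" discrete-action policies.
steer_left = lambda s: 0
steer_right = lambda s: 1
meta = MetaPolicy(state_dim=4, sub_policies=[steer_left, steer_right])
print(meta(torch.randn(4)))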

Daniel Crankshaw, Xin Wang, Giulio Zhou, Michael J. Franklin, Joseph E. Gonzalez, and Ion Stoica. "Clipper: A Low-Latency Online Prediction Serving System." 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17), 2017.

Machine learning is being deployed in a growing number of applications which demand real-time, accurate, and robust predictions under heavy query load. However, most machine learning frameworks and systems only address model training and not deployment. In this paper, we introduce Clipper, a general-purpose low-latency prediction serving system. Interposing between end-user applications and a wide range of machine learning frameworks, Clipper introduces a modular architecture to simplify model deployment across frameworks and applications. Furthermore, by introducing caching, batching, and adaptive model selection techniques, Clipper reduces prediction latency and improves prediction throughput, accuracy, and robustness without modifying the underlying machine learning frameworks. We evaluate Clipper on four common machine learning benchmark datasets and demonstrate its ability to meet the latency, accuracy, and throughput demands of online serving applications. Finally, we compare Clipper to the Tensorflow Serving system and demonstrate that we are able to achieve comparable throughput and latency while enabling model composition and online learning to improve accuracy and render more robust predictions.

@inproceedings{Clipper17,
 abstract = {
Machine learning is being deployed in a growing number of applications which demand real-time, accurate, and robust predictions under heavy query load. However, most machine learning frameworks and systems only address model training and not deployment.

In this paper, we introduce Clipper, a general-purpose low-latency prediction serving system. Interposing between end-user applications and a wide range of machine learning frameworks, Clipper introduces a modular architecture to simplify model deployment across frameworks and applications. Furthermore, by introducing caching, batching, and adaptive model selection techniques, Clipper reduces prediction latency and improves prediction throughput, accuracy, and robustness without modifying the underlying machine learning frameworks. We evaluate Clipper on four common machine learning benchmark datasets and demonstrate its ability to meet the latency, accuracy, and throughput demands of online serving applications. Finally, we compare Clipper to the Tensorflow Serving system and demonstrate that we are able to achieve comparable throughput and latency while enabling model composition and online learning to improve accuracy and render more robust predictions.
},
 address = {Boston, MA},
 author = {Daniel Crankshaw and Xin Wang and Giulio Zhou and Michael J. Franklin and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {https://www.usenix.org/conference/nsdi17/technical-sessions/presentation/crankshaw},
 booktitle = {14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)},
 code = {https://clipper.ai},
 date-modified = {2020-08-02 11:27:35 -0700},
 isbn = {978-1-931971-37-9},
 keywords = {peerrev, selected},
 pages = {613--627},
 publisher = {USENIX Association},
 title = {Clipper: A Low-Latency Online Prediction Serving System},
 url = {https://www.usenix.org/conference/nsdi17/technical-sessions/presentation/crankshaw},
 year = {2017}
}
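
Clipper's actual implementation is linked above; the toy sketch below illustrates just one of the techniques the abstract mentions, adaptive batching, in which queued queries are evaluated together up to a maximum batch size or timeout to trade a small amount of latency for higher throughput.

import queue
import threading
import time


def serve(model_batch_fn, requests: "queue.Queue", max_batch=8, timeout_s=0.01):
    while True:
        batch = [requests.get()]                     # block for the first query
        deadline = time.time() + timeout_s
        while len(batch) < max_batch and time.time() < deadline:
            try:
                batch.append(requests.get(timeout=max(0.0, deadline - time.time())))
            except queue.Empty:
                break
        inputs, sinks = zip(*batch)
        for sink, out in zip(sinks, model_batch_fn(list(inputs))):
            sink.append(out)                         # hand each result back to its caller


def fake_model(xs):                                  # stand-in for a framework's batch call
    time.sleep(0.005)                                # fixed per-batch cost
    return [x * 2 for x in xs]


q = queue.Queue()
threading.Thread(target=serve, args=(fake_model, q), daemon=True).start()

results = []
for i in range(20):
    q.put((i, results))                              # (input, where to deliver the output)
time.sleep(0.2)
print(sorted(results))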

Joseph M. Hellerstein, Vikram Sreekanti, Joseph E. Gonzalez, Sudhanshu Arora, Arka Bhattacharyya, Shirshanka Das, Akon Dey, Mark Donsky, Gabriel Fierro, Sreyashi Nag, Krishna Ramachandran, Chang She, Eric Sun, Carl Steinbach, and Venkat Subramanian. "Establishing Common Ground with Data Context." Conference on Innovative Data Systems Research (CIDR '17), 2017.


@inproceedings{Ground17,
 author = {Joseph M. Hellerstein and Vikram Sreekanti and Joseph E. Gonzalez and Sudhanshu Arora and Arka Bhattacharyya and Shirshanka Das and Akon Dey and Mark Donsky and Gabriel Fierro and Sreyashi Nag and Krishna Ramachandran and Chang She and Eric Sun and Carl Steinbach and Venkat Subramanian},
 booktitle = {Conference on Innovative Data Systems Research ({CIDR} '17)},
 keywords = {peerrev},
 title = {Establishing Common Ground with Data Context},
 year = {2017}
}

Wenting Zheng, Ankur Dave, Jethro G. Beekman, Raluca Ada Popa, Joseph E. Gonzalez, and Ion Stoica. "Opaque: An Oblivious and Encrypted Distributed Analytics Platform." 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17), 2017.


@inproceedings{Opaque17,
 address = {Boston, MA},
 author = {Wenting Zheng and Ankur Dave and Jethro G. Beekman and Raluca Ada Popa and Joseph E. Gonzalez and Ion Stoica},
 bdsk-url-1 = {https://www.usenix.org/conference/nsdi17/technical-sessions/presentation/zheng},
 booktitle = {14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)},
 date-modified = {2020-08-02 11:27:35 -0700},
 isbn = {978-1-931971-37-9},
 keywords = {peerrev},
 pages = {283--298},
 publisher = {USENIX Association},
 title = {Opaque: An Oblivious and Encrypted Distributed Analytics Platform},
 url = {https://www.usenix.org/conference/nsdi17/technical-sessions/presentation/zheng},
 year = {2017}
}

Matei Zaharia, Reynold S. Xin, Patrick Wendell, Tathagata Das, Michael Armbrust, Ankur Dave, Xiangrui Meng, Josh Rosen, Shivaram Venkataraman, Michael J. Franklin, Ali Ghodsi, Joseph E. Gonzalez, Scott Shenker, and Ion Stoica. "Apache Spark: A Unified Engine for Big Data Processing." Commun. ACM, 2016.


@article{acmqueu2016,
 acmid = {2934664},
 address = {New York, NY, USA},
 author = {Matei Zaharia and Reynold S. Xin and Patrick Wendell and Tathagata Das and Michael Armbrust and Ankur Dave and Xiangrui Meng and Josh Rosen and Shivaram Venkataraman and Michael J. Franklin and Ali Ghodsi and Joseph E. Gonzalez and Scott Shenker and Ion Stoica},
 bdsk-url-1 = {https://doi.acm.org/10.1145/2934664},
 date-modified = {2020-08-02 11:27:35 -0700},
 issn = {0001-0782},
 issue_date = {November 2016},
 journal = {Commun. ACM},
 keywords = {techreport},
 month = {9},
 number = {11},
 numpages = {10},
 pages = {56--65},
 publisher = {ACM},
 title = {Apache Spark: A Unified Engine for Big Data Processing},
 url = {https://doi.acm.org/10.1145/2934664},
 volume = {59},
 year = {2016}
}

Rong Gu, Qianhao Dong, Haoyuan Li, Joseph E. Gonzalez, Zhao Zhang, Shuai Wang, Yihua Huang, Scott Shenker, Ion Stoica, and Patrick P. C. Lee. "DFS-Perf: A Scalable and Unified Benchmarking Framework for Distributed File Systems." EECS Department, University of California, Berkeley Technical Report, 2016.


@techreport{Rong2016,
 author = {Rong Gu and Qianhao Dong and Haoyuan Li and Joseph E. Gonzalez and Zhao Zhang and Shuai Wang and Yihua Huang and Scott Shenker and Ion Stoica and Patrick P. C. Lee},
 bdsk-url-1 = {http://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-133.html},
 date-modified = {2020-08-02 11:27:35 -0700},
 institution = {EECS Department, University of California, Berkeley},
 keywords = {techreport},
 month = {7},
 number = {UCB/EECS-2016-133},
 title = {DFS-Perf: A Scalable and Unified Benchmarking Framework for Distributed File Systems},
 url = {http://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-133.html},
 year = {2016}
}

Ankur Dave, Alekh Jindal, Li Erran Li, Reynold Xin, Joseph E. Gonzalez, and Matei Zaharia. "GraphFrames: An Integrated API for Mixing Graph and Relational Queries." SIGMOD Grades Workshop, 2016.


@inproceedings{Graphframes16,
 author = {Ankur Dave and Alekh Jindal and Li Erran Li and Reynold Xin and Joseph E. Gonzalez and Matei Zaharia},
 booktitle = { {SIGMOD} Grades Workshop},
 keywords = {peerrev},
 title = {GraphFrames: An Integrated API for Mixing Graph and Relational Queries},
 year = {2016}
}

Neeraja J. Yadwadkar, Bharath Hariharan, Joseph E. Gonzalez, and Randy Katz. "Multi-Task Learning for Straggler Avoiding Predictive Job Scheduling." Journal of Machine Learning Research (JMLR '16), 2016.


@article{Mtl16,
 author = {Neeraja J. Yadwadkar and Bharath Hariharan and Joseph E. Gonzalez and Randy Katz},
 journal = {Journal of Machine Learning Research ({JMLR} '16)},
 keywords = {peerrev},
 title = {Multi-Task Learning for Straggler Avoiding Predictive Job Scheduling},
 year = {2016}
}

Francois W. Belletti, Evan R. Sparks, Michael J. Franklin, Alexandre M. Bayen, and Joseph E. Gonzalez. "Scalable Linear Causal Inference for Irregularly Sampled Time Series with Long Range Dependencies." CoRR (arXiv), 2016.


@article{BellettiSFBG16,
 archiveprefix = {arXiv},
 author = {Francois W. Belletti and Evan R. Sparks and Michael J. Franklin and Alexandre M. Bayen and Joseph E. Gonzalez},
 bdsk-url-1 = {http://arxiv.org/abs/1603.03336},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/journals/corr/BellettiSFBG16},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {1603.03336},
 journal = {CoRR},
 keywords = {arxivpre},
 timestamp = {Mon, 13 Aug 2018 16:48:40 +0200},
 title = {Scalable Linear Causal Inference for Irregularly Sampled Time Series with Long Range Dependencies},
 url = {http://arxiv.org/abs/1603.03336},
 volume = {abs/1603.03336},
 year = {2016}
}

Joseph E. Gonzalez, Peter Bailis, Michael I. Jordan, Michael J. Franklin, Joseph M. Hellerstein, Ali Ghodsi, and Ion Stoica. "Asynchronous Complex Analytics in a Distributed Dataflow Architecture." CoRR (arXiv), 2015.


@article{Gonzalez15,
 archiveprefix = {arXiv},
 author = {Joseph E. Gonzalez and Peter Bailis and Michael I. Jordan and Michael J. Franklin and Joseph M. Hellerstein and Ali Ghodsi and Ion Stoica},
 bdsk-url-1 = {http://arxiv.org/abs/1510.07092},
 bibsource = {dblp computer science bibliography, https://dblp.org},
 biburl = {https://dblp.org/rec/bib/journals/corr/GonzalezBJFHGS15},
 date-modified = {2020-08-02 11:27:35 -0700},
 eprint = {1510.07092},
 journal = {CoRR},
 keywords = {arxivpre},
 timestamp = {Mon, 13 Aug 2018 16:46:22 +0200},
 title = {Asynchronous Complex Analytics in a Distributed Dataflow Architecture},
 url = {http://arxiv.org/abs/1510.07092},
 volume = {abs/1510.07092},
 year = {2015}
}

Veronika Strnadova-Neeley, Aydin Buluc, Jarrod Chapman, John Gilbert, Joseph E. Gonzalez, and Leonid Oliker. "Efficient Data Reduction for Large-Scale Genetic Mapping." ACM Conference on Bioinformatics, Computational Biology, and Health Informatics (BCB '15), 2015.


@inproceedings{bcb2015,
 author = {Veronika Strnadova-Neeley and Aydin Buluc and Jarrod Chapman and John Gilbert and Joseph E. Gonzalez and Leonid Oliker},
 booktitle = { {ACM} Conference on Bioinformatics, Computational Biology, and Health Informatics ({BCB} '15)},
 keywords = {peerrev},
 title = {Efficient Data Reduction for Large-Scale Genetic Mapping},
 year = {2015}
}

Neeraja J. Yadwadkar, Bharath Hariharan, Joseph E. Gonzalez, and Randy Katz. "Faster Jobs in Distributed Data Processing using Multi-Task Learning." SIAM International Conference on Data Mining (SDM '15), 2015.


@inproceedings{sdm15,
 author = {Neeraja J. Yadwadkar and Bharath Hariharan and Joseph E. Gonzalez and Randy Katz},
 booktitle = { {SIAM} International Conference on Data Mining ({SDM} '15)},
 keywords = {peerrev},
 title = {Faster Jobs in Distributed Data Processing using Multi-Task Learning},
 year = {2015}
}

Daniel Crankshaw, Xin Wang, Joseph E. Gonzalez, and Michael J. Franklin. "Scalable Training and Serving of Personalized Models." Proceedings of the Learning Systems Workshop at NIPS 2015, 2015.


@inproceedings{Velox15,
 author = {Daniel Crankshaw and Xin Wang and Joseph E. Gonzalez and Michael J. Franklin},
 booktitle = {Proceedings of the Learning Systems Workshop at NIPS 2015},
 keywords = {peerrev},
 title = {Scalable Training and Serving of Personalized Models},
 year = {2015}
}

Daniel Crankshaw, Peter Bailis, Joseph E. Gonzalez, Haoyuan Li, Zhao Zhang, Michael J. Franklin, Ali Ghodsi, and Michael I. Jordan. "The Missing Piece in Complex Analytics: Low Latency, Scalable Model Management and Serving with Velox." Conference on Innovative Data Systems Research (CIDR '15), 2015.


@inproceedings{VeloxCIDR15,
 author = {Daniel Crankshaw and Peter Bailis and Joseph E. Gonzalez and Haoyuan Li and Zhao Zhang and Michael J. Franklin and Ali Ghodsi and Michael I. Jordan},
 booktitle = {Conference on Innovative Data Systems Research ({CIDR} '15)},
 keywords = {peerrev},
 title = {The Missing Piece in Complex Analytics: Low Latency, Scalable Model Management and Serving with Velox},
 year = {2015}
}

Veronika Strnadova, Aydin Buluc, Leonid Oliker, Joseph E. Gonzalez, Stefanie Jegelka, Jarrod Chapman, and John Gilbert. "Fast Clustering Methods for Genetic Mapping in Plants." 16th SIAM Conference on Parallel Processing for Scientific Computing, 2014.


@inproceedings{GeneClust14,
 author = {Veronika Strnadova and Aydin Buluc and Leonid Oliker and Joseph E. Gonzalez and Stefanie Jegelka and Jarrod Chapman and John Gilbert},
 booktitle = {16th SIAM Conference on Parallel Processing for Scientific Computing},
 keywords = {peerrev},
 title = {Fast Clustering Methods for Genetic Mapping in Plants},
 year = {2014}
}

Joseph E. Gonzalez. "From Graphs to Tables the Design of Scalable Systems for Graph Analytics." Proceedings of the 23rd International Conference on World Wide Web, 2014.


@inproceedings{WWW2014,
 author = {Joseph E. Gonzalez},
 bdsk-url-1 = {https://doi.acm.org/10.1145/2567948.2580059},
 booktitle = {Proceedings of the 23rd International Conference on World Wide Web},
 date-modified = {2020-08-02 11:27:35 -0700},
 isbn = {978-1-4503-2745-9},
 keywords = {techreport},
 location = {Seoul, Korea},
 numpages = {2},
 pages = {1149--1150},
 publisher = {ACM},
 series = {WWW '14 Companion},
 title = {From Graphs to Tables the Design of Scalable Systems for Graph Analytics},
 url = {https://doi.acm.org/10.1145/2567948.2580059},
 year = {2014}
}

Joseph E. Gonzalez, Reynold S. Xin, Ankur Dave, Daniel Crankshaw, Michael J. Franklin, and Ion Stoica. "GraphX: Graph Processing in a Distributed Dataflow Framework." 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14), 2014.


@inproceedings{GraphX14,
 author = {Joseph E. Gonzalez and Reynold S. Xin and Ankur Dave and Daniel Crankshaw and Michael J. Franklin and Ion Stoica},
 booktitle = {11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)},
 keywords = {peerrev},
 pages = {599--613},
 title = {GraphX: Graph Processing in a Distributed Dataflow Framework},
 year = {2014}
}

Xinghao Pan, Stefanie Jegelka, Joseph E. Gonzalez, Joseph K. Bradley, and Michael I. Jordan. "Parallel Double Greedy Submodular Maximization." Neural Information Processing Systems (NIPS '14), 2014.


@inproceedings{ParallelSubmodular14,
 author = {Xinghao Pan and Stefanie Jegelka and Joseph E. Gonzalez and Joseph K. Bradley and Michael I. Jordan},
 booktitle = {Neural Information Processing Systems ({NIPS} '14)},
 keywords = {peerrev},
 title = {Parallel Double Greedy Submodular Maximization},
 year = {2014}
}

David Bader, Aydın Buluç, John Gilbert, Joseph E. Gonzalez, Jeremy Kepner, and Timothy Mattson. "The Graph BLAS effort and its implications for Exascale." SIAM Workshop on Exascale Applied Mathematics Challenges and Opportunities (EX14), 2014.


@inproceedings{gblas14,
 author = {David Bader and Ayd{\i}n Bulu\c{c} and John Gilbert and Joseph E. Gonzalez and Jeremy Kepner and Timothy Mattson},
 booktitle = {SIAM Workshop on Exascale Applied Mathematics Challenges and Opportunities (EX14)},
 keywords = {peerrev},
 title = {The Graph BLAS effort and its implications for Exascale},
 year = {2014}
}

T. Mattson, D. Bader, J. Berry, A. Buluc, J. Dongarra, C. Faloutsos, J. Feo, J. Gilbert, J. Gonzalez, B. Hendrickson, J. Kepner, C. Leiserson, A. Lumsdaine, D. Padua, S. Poole, S. Reinhardt, M. Stonebraker, S. Wallach, and A. Yoo. "Standards for graph algorithm primitives." 2013 IEEE High Performance Extreme Computing Conference (HPEC), 2013.

It is our view that the state of the art in constructing a large collection of graph algorithms in terms of linear algebraic operations is mature enough to support the emergence of a standard set of primitive building blocks. This paper is a position paper defining the problem and announcing our intention to launch an open effort to define this standard.

@inproceedings{Standards13,
 abstract = {It is our view that the state of the art in constructing a large collection of graph algorithms in terms of linear algebraic operations is mature enough to support the emergence of a standard set of primitive building blocks. This paper is a position paper defining the problem and announcing our intention to launch an open effort to define this standard.},
 author = {T. Mattson and D. Bader and J. Berry and A. Buluc and J. Dongarra and C. Faloutsos and J. Feo and J. Gilbert and J. Gonzalez and B. Hendrickson and J. Kepner and C. Leiserson and A. Lumsdaine and D. Padua and S. Poole and S. Reinhardt and M. Stonebraker and S. Wallach and A. Yoo},
 bdsk-url-1 = {https://doi.org/10.1109/HPEC.2013.6670338},
 booktitle = {2013 IEEE High Performance Extreme Computing Conference (HPEC)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {9},
 pages = {1--2},
 title = {Standards for graph algorithm primitives},
 url = {https://doi.org/10.1109/HPEC.2013.6670338},
 year = {2013}
}

Evan Sparks, Ameet Talwalkar, Virginia Smith, Xinghao Pan, Joseph E. Gonzalez, Tim Kraska, Michael I. Jordan, and Michael J. Franklin. "MLI: An API for Distributed Machine Learning." International Conference on Data Mining (ICDM), 2013.

MLI is an Application Programming Interface designed to address the challenges of building Machine Learning algorithms in a distributed setting based on data-centric computing. Its primary goal is to simplify the development of high-performance, scalable, distributed algorithms. Our initial results show that, relative to existing systems, this interface can be used to build distributed implementations of a wide variety of common Machine Learning algorithms with minimal complexity and highly competitive performance and scalability.

@inproceedings{MLI12,
 abstract = {MLI is an Application Programming Interface designed to address the challenges of building Machine Learning algorithms in a distributed setting based on data-centric computing. Its primary goal is to simplify the development of high-performance, scalable, distributed algorithms. Our initial results show that, relative to existing systems, this interface can be used to build distributed implementations of a wide variety of common Machine Learning algorithms with minimal complexity and highly competitive performance and scalability.},
 author = {Evan Sparks and Ameet Talwalkar and Virginia Smith and Xinghao Pan and Joseph E. Gonzalez and Tim Kraska and Michael I. Jordan and Michael J. Franklin},
 bdsk-url-1 = {https://ieeexplore.ieee.org/abstract/document/6729619},
 booktitle = {International Conference on Data Mining (ICDM)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {12},
 organization = {IEEE},
 title = { {MLI}: An API for Distributed Machine Learning},
 url = {https://ieeexplore.ieee.org/abstract/document/6729619},
 year = {2013}
}

Reynold Xin, Joseph E. Gonzalez, Michael Franklin, and Ion Stoica. "GraphX: A Resilient Distributed Graph System on Spark." SIGMOD Grades Workshop, 2013.

From social networks to targeted advertising, big graphs capture the structure in data and are central to recent advances in machine learning and data mining. Unfortunately, directly applying existing data-parallel tools to graph computation tasks can be cumbersome and inefficient. The need for intuitive, scalable tools for graph computation has lead to the development of new graph-parallel systems (e.g., Pregel, PowerGraph) which are designed to efficiently execute graph algorithms. Unfortunately, these new graph-parallel systems do not address the challenges of graph construction and transformation which are often just as problematic as the subsequent computation. Furthermore, existing graph-parallel systems provide limited fault-tolerance and support for interactive data mining. We introduce GraphX, which combines the advantages of both data-parallel and graph-parallel systems by efficiently expressing graph computation within the Spark data-parallel framework. We leverage new ideas in distributed graph representation to efficiently distribute graphs as tabular data-structures. Similarly, we leverage advances in data-flow systems to exploit in-memory computation and fault-tolerance. We provide powerful new operations to simplify graph construction and transformation. Using these primitives we implement the PowerGraph and Pregel abstractions in less than 20 lines of code. Finally, by exploiting the Scala foundation of Spark, we enable users to interactively load, transform, and compute on massive graphs.

@inproceedings{SigmodGraphX13,
 abstract = {
From social networks to targeted advertising, big graphs capture the structure in data and are central to recent advances in machine learning and data mining. Unfortunately, directly applying existing data-parallel tools to graph computation tasks can be cumbersome and inefficient. The need for intuitive, scalable tools for graph computation has lead to the development of new graph-parallel systems (e.g., Pregel, PowerGraph) which are designed to efficiently execute graph algorithms. Unfortunately, these new graph-parallel systems do not address the challenges of graph construction and transformation which are often just as problematic as the subsequent computation. Furthermore, existing graph-parallel systems provide limited fault-tolerance and support for interactive data mining.

We introduce GraphX, which combines the advantages of both data-parallel and graph-parallel systems by efficiently expressing graph computation within the Spark data-parallel framework. We leverage new ideas in distributed graph representation to efficiently distribute graphs as tabular data-structures. Similarly, we leverage advances in data-flow systems to exploit in-memory computation and fault-tolerance. We provide powerful new operations to simplify graph construction and transformation. Using these primitives we implement the PowerGraph and Pregel abstractions in less than 20 lines of code. Finally, by exploiting the Scala foundation of Spark, we enable users to interactively load, transform, and compute on massive graphs.
},
 author = {Reynold Xin and Joseph E. Gonzalez and Michael Franklin and Ion Stoica},
 bdsk-url-1 = {https://dl.acm.org/citation.cfm?id=2484427},
 booktitle = { {SIGMOD} Grades Workshop},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = {GraphX: A Resilient Distributed Graph System on Spark},
 url = {https://dl.acm.org/citation.cfm?id=2484427},
 year = {2013}
}
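
The GraphX operators themselves are Scala/Spark APIs; the pure-Python sketch below only illustrates the Pregel-style graph-parallel pattern the abstract refers to, here iterating PageRank as message passing over a tiny in-memory edge list.

from collections import defaultdict

edges = [("a", "b"), ("b", "c"), ("c", "a"), ("a", "c")]
out_degree = defaultdict(int)
for src, _ in edges:
    out_degree[src] += 1

rank = {v: 1.0 for v in {u for e in edges for u in e}}

for _ in range(20):                       # Pregel-style supersteps
    messages = defaultdict(float)
    for src, dst in edges:                # each vertex sends rank / out_degree along its edges
        messages[dst] += rank[src] / out_degree[src]
    rank = {v: 0.15 + 0.85 * messages[v] for v in rank}

print({v: round(r, 3) for v, r in sorted(rank.items())})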

Xinghao Pan, Joseph E. Gonzalez, Stefanie Jegelka, Tamara Broderick, and Michael I. Jordan. "Optimistic Concurrency Control for Distributed Unsupervised Learning." NIPS '13, 2013.

Research on distributed machine learning algorithms has focused primarily on one of two extremes - algorithms that obey strict concurrency constraints or algorithms that obey few or no such constraints. We consider an intermediate alternative in which algorithms optimistically assume that conflicts are unlikely and if conflicts do arise a conflict-resolution protocol is invoked. We view this ``optimistic concurrency control'' paradigm as particularly appropriate for large-scale machine learning algorithms, particularly in the unsupervised setting. We demonstrate our approach in three problem areas: clustering, feature learning and online facility location. We evaluate our methods via large-scale experiments in a cluster computing environment.

@inproceedings{OCC13,
 abstract = {Research on distributed machine learning algorithms has focused primarily on one of two extremes - algorithms that obey strict concurrency constraints or algorithms that obey few or no such constraints. We consider an intermediate alternative in which algorithms optimistically assume that conflicts are unlikely and if conflicts do arise a conflict-resolution protocol is invoked. We view this ``optimistic concurrency control'' paradigm as particularly appropriate for large-scale machine learning algorithms, particularly in the unsupervised setting. We demonstrate our approach in three problem areas: clustering, feature learning and online facility location. We evaluate our methods via large-scale experiments in a cluster computing environment.},
 author = {Xinghao Pan and Joseph E. Gonzalez and Stefanie Jegelka and Tamara Broderick and Michael I. Jordan},
 bdsk-url-1 = {https://arxiv.org/abs/1307.8049},
 booktitle = { {NIPS} '13},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = {Optimistic Concurrency Control for Distributed Unsupervised Learning},
 url = {https://arxiv.org/abs/1307.8049},
 year = {2013}
}
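
The optimistic pattern described here is small enough to sketch. The Python below is an illustrative toy inspired by the clustering setting, not the paper's actual algorithm: workers process shards of points in parallel under the assumption that the set of cluster centers will not change, proposing a new center whenever a point is far from every existing one, and a cheap serial validation step resolves the rare conflicting proposals. The distance threshold and all names are assumptions made for the sketch.

import math

THRESHOLD = 2.0   # hypothetical distance at which a point spawns a new center

def dist(a, b):
    return math.hypot(a[0] - b[0], a[1] - b[1])

def optimistic_worker(points, centers):
    """Process one shard assuming `centers` stays fixed: assign each point to
    its nearest center, or propose the point itself as a new center."""
    assignments, proposals = {}, []
    for p in points:
        if centers and min(dist(p, c) for c in centers) <= THRESHOLD:
            assignments[p] = min(centers, key=lambda c: dist(p, c))
        else:
            proposals.append(p)   # optimistic: assume no one else covers this region
    return assignments, proposals

def validate(proposals, centers):
    """Serial conflict resolution: accept a proposal only if no center accepted
    earlier in this epoch already covers it."""
    accepted = []
    for p in proposals:
        if all(dist(p, c) > THRESHOLD for c in centers + accepted):
            accepted.append(p)    # no conflict: commit the new center
        # else: conflict detected; the point is simply reassigned next epoch
    return centers + accepted

shards = [[(0.0, 0.0), (0.5, 0.2)], [(0.1, 0.1), (10.0, 10.0)]]
centers = []
results = [optimistic_worker(shard, centers) for shard in shards]   # "parallel" phase
centers = validate([p for _, props in results for p in props], centers)
print(centers)   # the three proposals near the origin collapse to a single center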

Yucheng Low, Joseph E. Gonzalez, Aapo Kyrola, Danny Bickson, Carlos Guestrin, and Joseph M. Hellerstein. "Distributed GraphLab: A Framework for Machine Learning and Data Mining in the Cloud." Proceedings of Very Large Data Bases (PVLDB), 2012.

While high-level data-parallel frameworks, like MapReduce, simplify the design and implementation of large-scale data processing systems, they do not naturally or efficiently support many important data mining and machine learning algorithms and can lead to inefficient learning systems. To help fill this critical void, we introduced the GraphLab abstraction which naturally expresses asynchronous, dynamic, graph-parallel computation while ensuring data consistency and achieving a high degree of parallel performance in the shared-memory setting. In this paper, we extend the GraphLab framework to the substantially more challenging distributed setting while preserving strong data consistency guarantees. We develop graph-based extensions to pipelined locking and data versioning to reduce network congestion and mitigate the effect of network latency. We also introduce fault tolerance to the GraphLab abstraction using the classic Chandy-Lamport snapshot algorithm and demonstrate how it can be easily implemented by exploiting the GraphLab abstraction itself. Finally, we evaluate our distributed implementation of the GraphLab abstraction on a large Amazon EC2 deployment and show 1-2 orders of magnitude performance gains over Hadoop-based implementations.

@inproceedings{DistGraphlab12,
 abstract = {While high-level data parallel frameworks, like MapReduce, simplify the design and implementation of large-scale data processing systems, they do not naturally or efficiently support many important data mining and machine learning algorithms and can lead to inefficient learning systems. To help fill this critical void, we introduced the GraphLab abstraction which naturally expresses asynchronous, dynamic, graph-parallel computation while ensuring data consistency and achieving a high degree of parallel performance in the shared-memory setting. In this paper, we extend the GraphLab framework to the substantially more challenging distributed setting while preserving strong data consistency guarantees. We develop graph based extensions to pipelined locking and data versioning to reduce network congestion and mitigate the effect of network latency. We also introduce fault tolerance to the GraphLab abstraction using the classic Chandy-Lamport snapshot algorithm and demonstrate how it can be easily implemented by exploiting the GraphLab abstraction itself. Finally, we evaluate our distributed implementation of the GraphLab abstraction on a large Amazon EC2 deployment and show 1-2 orders of magnitude performance gains over Hadoop-based implementations.},
 author = {Yucheng Low and Joseph E. Gonzalez and Aapo Kyrola and Danny Bickson and Carlos Guestrin and Joseph M. Hellerstein},
 bdsk-url-1 = {https://arxiv.org/abs/1204.6078},
 booktitle = {Proceedings of Very Large Data Bases (PVLDB)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {8},
 title = {Distributed GraphLab: A Framework for Machine Learning and Data Mining in the Cloud},
 url = {https://arxiv.org/abs/1204.6078},
 year = {2012}
}
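
One claim worth unpacking is that the snapshot can be expressed within the abstraction itself. The Python sketch below is a loose, simplified rendering of that idea rather than the paper's implementation: a vertex reached by the snapshot marker saves its own state exactly once and schedules its neighbors, with a plain FIFO queue standing in for the scheduler. It records vertex state only; a full Chandy-Lamport snapshot also records in-flight channel state, which is omitted here, and all names are invented for the illustration.

from collections import deque

graph = {1: [2, 3], 2: [3], 3: [1]}          # adjacency list
vertex_data = {1: 0.3, 2: 0.7, 3: 0.1}       # mutable per-vertex state

def snapshot(start_vertex):
    snapshot_store = {}                      # vertex id -> saved state
    scheduled = deque([start_vertex])        # stand-in for the scheduler
    while scheduled:
        v = scheduled.popleft()
        if v in snapshot_store:
            continue                         # marker already handled at this vertex
        snapshot_store[v] = vertex_data[v]   # save local state exactly once
        scheduled.extend(graph[v])           # forward the marker to the neighbors
    return snapshot_store

print(snapshot(start_vertex=1))              # {1: 0.3, 2: 0.7, 3: 0.1}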

Joseph E. Gonzalez, Yucheng Low, Haijie Gu, Danny Bickson, and Carlos Guestrin. "PowerGraph: Distributed Graph-Parallel Computation on Natural Graphs." OSDI '12, 2012.

Large-scale graph-structured computation is central to tasks ranging from targeted advertising to natural language processing and has led to the development of several graph-parallel abstractions including Pregel and GraphLab. However, the natural graphs commonly found in the real-world have highly skewed power-law degree distributions, which challenge the assumptions made by these abstractions, limiting performance and scalability. In this paper, we characterize the challenges of computation on natural graphs in the context of existing graph-parallel abstractions. We then introduce the PowerGraph abstraction which exploits the internal structure of graph programs to address these challenges. Leveraging the PowerGraph abstraction we introduce a new approach to distributed graph placement and representation that exploits the structure of power-law graphs. We provide a detailed analysis and experimental evaluation comparing PowerGraph to two popular graph-parallel systems. Finally, we describe three different implementation strategies for PowerGraph and discuss their relative merits with empirical evaluations on large-scale real-world problems demonstrating order of magnitude gains.

@inproceedings{PowerGraph12,
 abstract = {
Large-scale graph-structured computation is central to tasks ranging from targeted advertising to natural language processing and has led to the development of several graph-parallel abstractions including Pregel and GraphLab. However, the natural graphs commonly found in the real-world have highly skewed power-law degree distributions, which challenge the assumptions made by these abstractions, limiting performance and scalability.

In this paper, we characterize the challenges of computation on natural graphs in the context of existing graph-parallel abstractions. We then introduce the PowerGraph abstraction which exploits the internal structure of graph programs to address these challenges. Leveraging the PowerGraph abstraction we introduce a new approach to distributed graph placement and representation that exploits the structure of power-law graphs. We provide a detailed analysis and experimental evaluation comparing PowerGraph to two popular graph-parallel systems. Finally, we describe three different implementation strategies for PowerGraph and discuss their relative merits with empirical evaluations on large-scale real-world problems demonstrating order of magnitude gains.
},
 author = {Joseph E. Gonzalez and Yucheng Low and Haijie Gu and Danny Bickson and Carlos Guestrin},
 bdsk-url-1 = {https://www.usenix.org/system/files/conference/osdi12/osdi12-final-167.pdf},
 booktitle = { {OSDI} '12},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = {PowerGraph: Distributed Graph-Parallel Computation on Natural Graphs},
 url = {https://www.usenix.org/system/files/conference/osdi12/osdi12-final-167.pdf},
 year = {2012}
}
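
A concrete way to see the PowerGraph abstraction is its gather-apply-scatter (GAS) decomposition: gather and scatter run per edge and can therefore be split across the machines spanning a high-degree vertex, while apply runs once per vertex. The Python below is a single-machine sketch of that decomposition with PageRank as the vertex program; the damping constant, iteration count, and toy graph are choices made for the sketch, not details from the paper.

graph = {1: [2, 3], 2: [3], 3: [1]}                  # src -> list of dsts
in_edges = {v: [u for u, outs in graph.items() if v in outs] for v in graph}
rank = {v: 1.0 for v in graph}

def gather(u, v):
    """Runs independently for each in-edge of v, so the work of a high-degree
    vertex can be spread over the machines holding its edges."""
    return rank[u] / len(graph[u])

def gather_sum(a, b):
    """Commutative, associative combiner of gather results."""
    return a + b

def apply_fn(v, total):
    """Runs exactly once per vertex on the combined gather result."""
    return 0.15 + 0.85 * total

# The scatter phase, which would activate neighbors for dynamic scheduling,
# is omitted: this toy simply runs a fixed number of synchronous sweeps.
for _ in range(10):
    new_rank = {}
    for v in graph:
        acc = 0.0
        for u in in_edges[v]:
            acc = gather_sum(acc, gather(u, v))      # gather + combine per edge
        new_rank[v] = apply_fn(v, acc)               # apply once per vertex
    rank = new_rank

print(rank)

The point of the factoring is that gather and gather_sum only ever touch one edge or one partial sum at a time, which is what allows a vertex-cut placement to split a power-law vertex across machines.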

Amr Ahmed, Mohamed Aly, Joseph Gonzalez, Shravan Narayanamurthy, and Alex Smola. "Scalable Inference in Latent Variable Models." Conference on Web Search and Data Mining (WSDM), 2012.

Latent variable techniques are pivotal in tasks ranging from predicting user click patterns and targeting ads to organizing the news and managing user-generated content. Latent variable techniques like topic modeling, clustering, and subspace estimation provide substantial insight into the latent structure of complex data with little or no external guidance, making them ideal for reasoning about large-scale, rapidly evolving datasets. Unfortunately, due to the data dependencies and global state introduced by latent variables and the iterative nature of latent variable inference, latent-variable techniques are often prohibitively expensive to apply to large-scale, streaming datasets. In this paper we present a scalable parallel framework for efficient inference in latent variable models over streaming web-scale data. Our framework addresses three key challenges: 1) synchronizing the global state which includes global latent variables (e.g., cluster centers and dictionaries); 2) efficiently storing and retrieving the large local state which includes the data points and their corresponding latent variables (e.g., cluster membership); and 3) sequentially incorporating streaming data (e.g., the news). We address these challenges by introducing: 1) a novel delta-based aggregation system with a bandwidth-efficient communication protocol; 2) schedule-aware out-of-core storage; and 3) approximate forward sampling to rapidly incorporate new data. We demonstrate state-of-the-art performance of our framework by easily tackling datasets two orders of magnitude larger than those addressed by the current state-of-the-art. Furthermore, we provide an optimized and easily customizable open-source implementation of the framework.

@inproceedings{ParamServer12,
 abstract = {
Latent variable techniques are pivotal in tasks ranging from predicting user click patterns and targeting ads to organizing the news and managing user generated content. Latent variable techniques like topic modeling, clustering, and subspace estimation provide substantial insight into the latent structure of complex data with little or no external guidance making them ideal for reasoning about large-scale, rapidly evolving datasets. Unfortunately, due to the data dependencies and global state introduced by latent variables and the iterative nature of latent variable inference, latent-variable techniques are often prohibitively expensive to apply to large-scale, streaming datasets.

In this paper we present a scalable parallel framework for efficient inference in latent variable models over streaming web-scale data. Our framework addresses three key challenges: 1) synchronizing the global state which includes global latent variables (e.g., cluster centers and dictionaries); 2) efficiently storing and retrieving the large local state which includes the data-points and their corresponding latent variables (e.g., cluster membership); and 3) sequentially incorporating streaming data (e.g., the news). We address these challenges by introducing: 1) a novel delta-based aggregation system with a bandwidth-efficient communication protocol; 2) schedule-aware out-of-core storage; and 3) approximate forward sampling to rapidly incorporate new data. We demonstrate state-of-the-art performance of our framework by easily tackling datasets two orders of magnitude larger than those addressed by the current state-of-the-art. Furthermore, we provide an optimized and easily customizable open-source implementation of the framework.
},
 author = {Amr Ahmed and Mohamed Aly and Joseph Gonzalez and Shravan Narayanamurthy and Alex Smola},
 bdsk-url-1 = {http://www.cs.cmu.edu/~jegonzal/papers/ahmed_scalable_inference_in_latent_variable_models.pdf},
 booktitle = {Conference on Web Search and Data Mining (WSDM)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = {Scalable Inference in Latent Variable Models},
 url = {http://www.cs.cmu.edu/~jegonzal/papers/ahmed_scalable_inference_in_latent_variable_models.pdf},
 year = {2012}
}
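
The delta-based aggregation idea can be shown with a toy stand-in. In the Python sketch below, which is an illustration rather than the paper's system, each worker keeps a stale replica of a global count table, records only its local changes, and a sync step ships those deltas to the shared copy and refreshes the replica; the class, method, and key names are all invented for the sketch.

from collections import Counter

global_counts = Counter()                     # the shared global state

class Worker:
    def __init__(self):
        self.local = Counter(global_counts)   # stale replica of the global state
        self.delta = Counter()                # changes accumulated since last sync

    def observe(self, key, amount=1):
        """Local update: apply to the replica and remember the delta."""
        self.local[key] += amount
        self.delta[key] += amount

    def sync(self):
        """Send only the delta; receive the up-to-date global state."""
        global_counts.update(self.delta)      # server applies the delta
        self.delta.clear()
        self.local = Counter(global_counts)   # refresh the replica

w1, w2 = Worker(), Worker()
w1.observe(("sports", "ball"), 3)
w2.observe(("sports", "ball"), 2)
w2.observe(("politics", "vote"), 1)
w1.sync(); w2.sync()
print(global_counts[("sports", "ball")])      # 5: both deltas were merged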

Joseph E. Gonzalez, Yucheng Low, Arthur Gretton, and Carlos Guestrin. "Parallel Gibbs Sampling: From Colored Fields to Thin Junction Trees." Artificial Intelligence and Statistics (AISTATS), 2011.

We explore the task of constructing a parallel Gibbs sampler, to improve both mixing and the exploration of high-likelihood states. Recent work in parallel Gibbs sampling has focused on update schedules which do not guarantee convergence to the intended stationary distribution. In this work, we propose two methods to construct parallel Gibbs samplers guaranteed to draw from the targeted distribution. The first method, called the Chromatic sampler, uses graph coloring to construct a direct parallelization of the classic sequential scan Gibbs sampler. In the case of 2-colorable models we relate the Chromatic sampler to the Synchronous Gibbs sampler (which draws all variables simultaneously in parallel), and reveal new ergodic properties of Synchronous Gibbs chains. Our second method, the Splash sampler, is a complementary strategy which can be used when the variables are tightly coupled. This constructs and samples multiple blocks in parallel, using a novel locking protocol and an iterative junction tree generation algorithm. We further improve the Splash sampler through adaptive tree construction. We demonstrate the benefits of our two sampling algorithms on large synthetic and real-world models using a 32-processor multi-core system.

@inproceedings{GibbsSplash11,
 abstract = {We explore the task of constructing a parallel Gibbs sampler, to both improve mixing and the exploration of high likelihood states. Recent work in parallel Gibbs sampling has focused on update schedules which do not guarantee convergence to the intended stationary distribution. In this work, we propose two methods to construct parallel Gibbs samplers guaranteed to draw from the targeted distribution. The first method, called the Chromatic sampler, uses graph coloring to construct a direct parallelization of the classic sequential scan Gibbs sampler. In the case of 2-colorable models we relate the Chromatic sampler to the Synchronous Gibbs sampler (which draws all variables simultaneously in parallel), and reveal new ergodic properties of Synchronous Gibbs chains. Our second method, the Splash sampler, is a complementary strategy which can be used when the variables are tightly coupled. This constructs and samples multiple blocks in parallel, using a novel locking protocol and an iterative junction tree generation algorithm. We further improve the Splash sampler through adaptive tree construction. We demonstrate the benefits of our two sampling algorithms on large synthetic and real-world models using a 32 processor multi-core system.},
 author = {Joseph E. Gonzalez and Yucheng Low and Arthur Gretton and Carlos Guestrin},
 bdsk-url-1 = {http://proceedings.mlr.press/v15/gonzalez11a.html},
 booktitle = {Artificial Intelligence and Statistics (AISTATS)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {5},
 title = {Parallel Gibbs Sampling: From Colored Fields to Thin Junction Trees},
 url = {http://proceedings.mlr.press/v15/gonzalez11a.html},
 year = {2011}
}
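
The Chromatic sampler is simple enough to sketch directly. The Python below is an illustrative toy rather than the paper's implementation: it greedily colors a small Ising model so adjacent variables get different colors, then resamples one color class at a time; variables within a class are conditionally independent given the others, so those draws could be issued in parallel without changing the target distribution. The 4-cycle graph and coupling strength are arbitrary choices for the sketch.

import math, random

graph = {0: [1, 2], 1: [0, 3], 2: [0, 3], 3: [1, 2]}   # a 4-cycle MRF
J = 0.8                                                # Ising coupling strength
state = {v: random.choice([-1, 1]) for v in graph}

def greedy_coloring(graph):
    """Assign each vertex the smallest color not used by its neighbors."""
    color = {}
    for v in graph:
        taken = {color[u] for u in graph[v] if u in color}
        color[v] = next(c for c in range(len(graph)) if c not in taken)
    return color

def gibbs_conditional(v):
    """Draw x_v from P(x_v | neighbors) for an Ising model with coupling J."""
    s = sum(state[u] for u in graph[v])
    p_plus = 1.0 / (1.0 + math.exp(-2.0 * J * s))
    return 1 if random.random() < p_plus else -1

color = greedy_coloring(graph)
for _ in range(100):
    for c in set(color.values()):
        block = [v for v in graph if color[v] == c]
        # Vertices in `block` have no edges between them, so these draws
        # could run in parallel without changing the stationary distribution.
        new_values = {v: gibbs_conditional(v) for v in block}
        state.update(new_values)
print(state)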

Yucheng Low, Joseph E. Gonzalez, Aapo Kyrola, Daniel Bickson, Carlos Guestrin, and Joseph M. Hellerstein. "GraphLab: A New Parallel Framework for Machine Learning." Conference on Uncertainty in Artificial Intelligence (UAI), 2010.

Designing and implementing efficient, provably correct parallel machine learning (ML) algorithms is challenging. Existing high-level parallel abstractions like MapReduce are insufficiently expressive while low-level tools like MPI and Pthreads leave ML experts repeatedly solving the same design challenges. By targeting common patterns in ML, we developed GraphLab, which improves upon abstractions like MapReduce by compactly expressing asynchronous iterative algorithms with sparse computational dependencies while ensuring data consistency and achieving a high degree of parallel performance. We demonstrate the expressiveness of the GraphLab framework by designing and implementing parallel versions of belief propagation, Gibbs sampling, Co-EM, Lasso, and Compressed Sensing. We show that using GraphLab we can achieve excellent parallel performance on large-scale real-world problems.

@inproceedings{Graphlab10,
 abstract = {Designing and implementing efficient, provably correct parallel machine learning (ML) algorithms is challenging. Existing high-level parallel abstractions like MapReduce are insufficiently expressive while low-level tools like MPI and Pthreads leave ML experts repeatedly solving the same design challenges. By targeting common patterns in ML, we developed GraphLab, which improves upon abstractions like MapReduce by compactly expressing asynchronous iterative algorithms with sparse computational dependencies while ensuring data consistency and achieving a high degree of parallel performance. We demonstrate the expressiveness of the GraphLab framework by designing and implementing parallel versions of belief propagation, Gibbs sampling, Co-EM, Lasso and Compressed Sensing. We show that using GraphLab we can achieve excellent parallel performance on large scale real-world problems.},
 author = {Yucheng Low and Joseph E. Gonzalez and Aapo Kyrola and Daniel Bickson and Carlos Guestrin and Joseph M. Hellerstein},
 bdsk-url-1 = {https://arxiv.org/abs/1006.4990},
 booktitle = {Conference on Uncertainty in Artificial Intelligence (UAI)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 title = {GraphLab: A New Parallel Framework for Machine Learning},
 url = {https://arxiv.org/abs/1006.4990},
 year = {2010}
}
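
The abstraction described here, a user-supplied update function over a vertex and its neighborhood driven by a dynamic scheduler, can be sketched in a few lines. The Python below is a sequential toy, not GraphLab: single-source shortest paths is written as an update function that relaxes a vertex's out-edges and schedules only the neighbors whose values actually improved, with a FIFO queue standing in for the scheduler. The edge weights and names are made up for the sketch.

from collections import deque

INF = float("inf")
edges = {("s", "a"): 1.0, ("s", "b"): 4.0, ("a", "b"): 1.0, ("b", "c"): 2.0}
out_nbrs = {}
for (u, v), w in edges.items():
    out_nbrs.setdefault(u, []).append((v, w))
    out_nbrs.setdefault(v, [])                # make sure sinks appear too

dist = {v: INF for v in out_nbrs}
dist["s"] = 0.0

def update(v, scheduler):
    """Relax the out-edges of v; schedule any neighbor whose value improved."""
    for u, w in out_nbrs[v]:
        if dist[v] + w < dist[u]:
            dist[u] = dist[v] + w
            scheduler.append(u)               # dynamic scheduling: only touched work

scheduler = deque(["s"])
while scheduler:
    update(scheduler.popleft(), scheduler)

print(dist)    # {'s': 0.0, 'a': 1.0, 'b': 2.0, 'c': 4.0}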

Joseph E. Gonzalez, Yucheng Low, Carlos Guestrin, and David O'Hallaron. "Distributed Parallel Inference on Large Factor Graphs." Conference on Uncertainty in Artificial Intelligence (UAI), 2009.

As computer clusters become more common and the size of the problems encountered in the field of AI grows, there is an increasing demand for efficient parallel inference algorithms. We consider the problem of parallel inference on large factor graphs in the distributed memory setting of computer clusters. We develop a new efficient parallel inference algorithm, DBRSplash, which incorporates over-segmented graph partitioning, belief residual scheduling, and uniform work Splash operations. We empirically evaluate the DBRSplash algorithm on a 120-processor cluster and demonstrate linear to super-linear performance gains on large factor graph models.

@inproceedings{DistSplash09,
 abstract = {As computer clusters become more common and the size of the problems encountered in the field of AI grows, there is an increasing demand for efficient parallel inference algorithms. We consider the problem of parallel inference on large factor graphs in the distributed memory setting of computer clusters. We develop a new efficient parallel inference algorithm, DBRSplash, which incorporates over-segmented graph partitioning, belief residual scheduling, and uniform work Splash operations. We empirically evaluate the DBRSplash algorithm on a 120 processor cluster and demonstrate linear to super-linear performance gains on large factor graph models.},
 author = {Joseph E. Gonzalez and Yucheng Low and Carlos Guestrin and David O'Hallaron},
 bdsk-url-1 = {https://arxiv.org/pdf/1205.2645.pdf},
 booktitle = {Conference on Uncertainty in Artificial Intelligence (UAI)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {7},
 title = {Distributed Parallel Inference on Large Factor Graphs},
 url = {https://arxiv.org/pdf/1205.2645.pdf},
 year = {2009}
}
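
Belief residual scheduling, one of the listed ingredients, is easy to illustrate on its own. The Python sketch below keeps a max-priority queue keyed by how much each vertex's value would change and always updates the most out-of-date vertex next; a damped neighbor-averaging fixed point stands in for the actual belief propagation updates, so no BP messages appear, and the graph, damping factor, and tolerance are toy choices rather than anything from the paper.

import heapq

graph = {0: [1], 1: [0, 2], 2: [1, 3], 3: [2]}     # a small chain
value = {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0}
PINNED = {0}                                       # boundary vertex held fixed
TOL = 1e-6

def proposed(v):
    """Damped average of the current neighbor values."""
    return 0.5 * value[v] + 0.5 * sum(value[u] for u in graph[v]) / len(graph[v])

def residual(v):
    return 0.0 if v in PINNED else abs(proposed(v) - value[v])

heap = [(-residual(v), v) for v in graph if v not in PINNED]
heapq.heapify(heap)

while heap:
    _, v = heapq.heappop(heap)
    res = residual(v)                              # recompute: the entry may be stale
    if res < TOL:
        continue
    value[v] = proposed(v)                         # apply the update
    for u in graph[v]:                             # neighbors' residuals changed
        if u not in PINNED:
            heapq.heappush(heap, (-residual(u), u))

print({v: round(x, 4) for v, x in value.items()})  # values pulled toward the pinned boundary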

Joseph E. Gonzalez, Yucheng Low, and Carlos Guestrin. "Residual Splash for Optimally Parallelizing Belief Propagation." Artificial Intelligence and Statistics (AISTATS), 2009.

As computer architectures move towards parallelism we must build a new theoretical understanding of parallelism in machine learning. In this paper we focus on parallelizing message passing inference algorithms in graphical models. We develop a theoretical understanding of the limitations of parallelism in belief propagation and bound the optimal achievable parallel performance on a certain class of graphical models. We demonstrate that the fully synchronous parallelization of belief propagation is highly inefficient. We provide a new parallel belief propagation algorithm which achieves optimal performance on a certain class of graphical models. Using two challenging real-world problems, we empirically evaluate the performance of our algorithm. On the real-world problems, we find that our new algorithm achieves near-linear performance improvements and outperforms alternative parallel belief propagation algorithms.

@inproceedings{ParallelSplash09,
 abstract = {As computer architectures move towards parallelism we must build a new theoretical understanding of parallelism in machine learning. In this paper we focus on parallelizing message passing inference algorithms in graphical models. We develop a theoretical understanding of the limitations of parallelism in belief propagation and bound the optimal achievable running parallel performance on a certain class of graphical models. We demonstrate that the fully synchronous parallelization of belief propagation is highly inefficient. We provide a new parallel belief propagation which achieves optimal performance on a certain class of graphical models. Using two challenging real-world problems, we empirically evaluate the performance of our algorithm. On the real-world problems, we find that our new algorithm achieves near linear performance improvements and outperforms alternative parallel belief propagation algorithms.},
 author = {Joseph E. Gonzalez and Yucheng Low and Carlos Guestrin},
 bdsk-url-1 = {http://proceedings.mlr.press/v5/gonzalez09a.html},
 booktitle = {Artificial Intelligence and Statistics (AISTATS)},
 date-modified = {2020-08-02 11:27:35 -0700},
 keywords = {peerrev},
 month = {4},
 title = {Residual Splash for Optimally Parallelizing Belief Propagation},
 url = {http://proceedings.mlr.press/v5/gonzalez09a.html},
 year = {2009}
}
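
The Splash operation itself has a structure a short sketch can convey: grow a bounded breadth-first tree around a root vertex, then update along the tree from the leaves in toward the root and back out again. The Python below illustrates only that structure, again using a damped-averaging update as a stand-in for the actual belief propagation messages; the splash size, example graph, and function names are arbitrary choices for the sketch.

from collections import deque

graph = {0: [1, 2], 1: [0, 3], 2: [0, 4], 3: [1], 4: [2]}
value = {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}

def grow_splash(root, max_size):
    """Breadth-first order of at most max_size vertices around the root."""
    order, seen, frontier = [], {root}, deque([root])
    while frontier and len(order) < max_size:
        v = frontier.popleft()
        order.append(v)
        for u in graph[v]:
            if u not in seen:
                seen.add(u)
                frontier.append(u)
    return order

def update(v):
    """Stand-in for a belief propagation update at vertex v."""
    value[v] = 0.5 * value[v] + 0.5 * sum(value[u] for u in graph[v]) / len(graph[v])

def splash(root, max_size=4):
    order = grow_splash(root, max_size)
    for v in reversed(order):   # sweep from the outer vertices in toward the root
        update(v)
    for v in order:             # and back out from the root to the outer vertices
        update(v)

splash(root=0)
print({v: round(x, 3) for v, x in value.items()})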