@inproceedings{10.1145/3563657.3596138,
  author    = {Zamfirescu-Pereira, J.D. and Wei, Heather and Xiao, Amy and Gu, Kitty and Jung, Grace and Lee, Matthew G and Hartmann, Bjoern and Yang, Qian},
  title     = {Herding AI Cats: Lessons from Designing a Chatbot by Prompting GPT-3},
  year      = {2023},
  isbn      = {9781450398930},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3563657.3596138},
  doi       = {10.1145/3563657.3596138},
  abstract  = {Prompting Large Language Models (LLMs) is an exciting new approach to designing chatbots. But can it improve LLM's user experience (UX) reliably enough to power chatbot products? Our attempt to design a robust chatbot by prompting GPT-3/4 alone suggests: not yet. Prompts made achieving ``80\%'' UX goals easy, but not the remaining 20\%. Fixing the few remaining interaction breakdowns resembled herding cats: We could not address one UX issue or test one design solution at a time; instead, we had to handle everything everywhere all at once. Moreover, because no prompt could make GPT reliably say ``I don't know'' when it should, the user-GPT conversations had no guardrails after a breakdown occurred, often leading to UX downward spirals. These risks incentivized us to design highly prescriptive prompts and scripted bots, counter to the promises of LLM-powered chatbots. This paper describes this case study, unpacks prompting's fickleness and its impact on UX design processes, and discusses implications for LLM-based design methods and tools.},
  booktitle = {Proceedings of the 2023 ACM Designing Interactive Systems Conference},
  pages     = {2206--2220},
  numpages  = {15},
  keywords  = {UX, conversational user interface, prompt engineering, GPT},
  location  = {Pittsburgh, PA, USA},
  series    = {DIS '23}
}